From 9a9ca3fee3e268e03f64ccd1760850aabed0ba08 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Wed, 1 May 2024 18:55:08 +0100 Subject: [PATCH 01/38] Added LDRSWroW, LDAXRB, stlxrb insts --- src/lib/arch/aarch64/Instruction_address.cc | 15 +++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 12 ++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..3f27b5acc3 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -451,6 +451,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } + case Opcode::AArch64_LDAXRB: { // ldaxrb wt, [xn] + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; @@ -749,6 +753,13 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } + case Opcode::AArch64_LDRSWroW: { // ldrsw xt, [xn, wm{, extend + // {#amount}}] + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); + break; + } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] uint64_t base = sourceValues_[0].get() + metadata_.operands[1].mem.disp; @@ -1350,6 +1361,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } + case Opcode::AArch64_STLXRB: { // stlxrb ws, wt, [xn] + setMemoryAddresses({{sourceValues_[1].get(), 1}}); + break; + } case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..b8352c79bf 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3330,6 +3330,11 @@ void Instruction::execute() { results_[0] = memoryData_[0]; break; } + case Opcode::AArch64_LDAXRB: { // ldaxrb wt, [xn] + // LOAD + results_[0] = memoryData_[0].zeroExtend(1, 8); + break; + } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] // LOAD results_[0] = memoryData_[0].zeroExtend(4, 8); @@ -3603,6 +3608,12 @@ void Instruction::execute() { results_[0] = static_cast(memoryData_[0].get()); break; } + case Opcode::AArch64_LDRSWroW: { // ldrsw xt, [xn, wm, {extend + // {#amount}}] + // LOAD + results_[0] = static_cast(memoryData_[0].get()); + break; + } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] // LOAD results_[0] = static_cast(memoryData_[0].get()); @@ -4945,6 +4956,7 @@ void Instruction::execute() { memoryData_[0] = sourceValues_[0]; break; } + case Opcode::AArch64_STLXRB: // stlxrb ws, wt, [xn] case Opcode::AArch64_STLXRW: // stlxr ws, wt, [xn] case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] // STORE From 9adaeee95dcea32951e9b1f4a4969fb8fe266379 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Wed, 1 May 2024 19:01:09 +0100 Subject: [PATCH 02/38] Magic OMP affinity fix (thanks Jack) --- src/lib/arch/aarch64/ExceptionHandler.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1a..33701b049b 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -429,15 +429,16 @@ bool 
ExceptionHandler::init() { << std::endl; return fatal(); } - uint64_t retval = (pid == 0) ? 1 : 0; - stateChange = {ChangeType::REPLACEMENT, {R0}, {retval}}; - stateChange.memoryAddresses.push_back({mask, 1}); + uint64_t retval = static_cast(bitmask); + stateChange = {ChangeType::REPLACEMENT, {R0}, {sizeof(retval)}}; + stateChange.memoryAddresses.push_back({mask, 8}); stateChange.memoryAddressValues.push_back(bitmask); } else { stateChange = {ChangeType::REPLACEMENT, {R0}, {-1ll}}; } break; } + case 131: { // tgkill // TODO: Functionality temporarily omitted since simeng only has a // single thread at the moment From 70f0387cae35aa3f15dbb185135097c3237e09f4 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 20 May 2024 12:07:22 +0100 Subject: [PATCH 03/38] Added Cpy (Simd&FP scalar) instruction and alias, with tests for each size --- .../simeng/arch/aarch64/helpers/sve.hh | 26 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 +++ test/regression/aarch64/instructions/sve.cc | 117 ++++++++++++++++++ 3 files changed, 159 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe6..4c162bcddf 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -257,6 +257,32 @@ RegisterValue sveCpy_imm( return {out, 256}; } +/** Helper function for SVE instructions with the format `cpy zd, pg/m, vn + * T represents the type of sourceValues (e.g. for zd.d, T = int64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveCpy_Scalar( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* zd = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T vn = sourceValues[2].get(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = vn; + } else { + out[i] = zd[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `dec xdn{, * pattern{, MUL #imm}}`. * T represents the type of operation (e.g. for DECD, T = uint64_t). 
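// Illustrative sketch only (not SimEng code): the sveCpy_Scalar helper added
// above relies on SimEng's predicate layout, where the element at index i of
// width sizeof(T) bytes is governed by bit ((i % (64 / sizeof(T))) * sizeof(T))
// of predicate word i / (64 / sizeof(T)).  The standalone function below shows
// the same merge behaviour for `cpy zd, pg/m, vn`; all names are hypothetical.
#include <cstdint>
#include <cstdio>

template <typename T>
void cpyScalarMerge(const T* zd_in, const uint64_t* pg, T vn, T* zd_out,
                    unsigned vlBits) {
  const unsigned elems = vlBits / (sizeof(T) * 8);
  for (unsigned i = 0; i < elems; i++) {
    const uint64_t bit = 1ull << ((i % (64 / sizeof(T))) * sizeof(T));
    // Active lanes take the scalar source; inactive lanes keep zd's old value.
    zd_out[i] = (pg[i / (64 / sizeof(T))] & bit) ? vn : zd_in[i];
  }
}

int main() {
  // VL = 128 bits with 64-bit elements gives two lanes; the predicate below
  // activates lane 0 only, so only the first element is replaced by 99.
  uint64_t zd[2] = {1, 2}, out[2];
  uint64_t pg[4] = {0x1, 0, 0, 0};
  cpyScalarMerge<uint64_t>(zd, pg, 99, out, 128);
  std::printf("%llu %llu\n", (unsigned long long)out[0],
              (unsigned long long)out[1]);  // prints: 99 2
  return 0;
}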
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index b8352c79bf..a7f8da3e86 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -949,6 +949,22 @@ void Instruction::execute() { results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_CPY_ZPmV_B: { // cpy zd.b, pg/m, vn.b + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_D: { // cpy zd.d, pg/m, vn.d + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_H: { // cpy zd.h, pg/m, vn.h + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_S: { // cpy zd.s, pg/m, vn.s + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_DUPi32: { // dup vd, vn.s[index] results_[0] = vecDup_gprOrIndex(sourceValues_, metadata_, false); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b95..e75b1c2061 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1544,6 +1544,123 @@ TEST_P(InstSve, cpy) { CHECK_NEON(4, int64_t, fillNeon({12}, VL / 8)); CHECK_NEON(5, int64_t, fillNeon({static_cast(-2048)}, VL / 16)); + + // SIMD & FP scalar + // 8-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.b + whilelo p1.b, xzr, x0 + + dup z1.b, #10 + dup z2.b, #-8 + + cpy z0.b, p0/m, z1.b + cpy z1.b, p0/m, z2.b + cpy z2.b, p1/m, z1.b + cpy z3.b, p1/m, z2.b + + # Test Alias + mov z4.b, p0/m, z1.b + mov z5.b, p1/m, z2.b + )"); + CHECK_NEON(0, int8_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(2, int8_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int8_t, fillNeon({-8}, VL / 16)); + CHECK_NEON(4, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(5, int8_t, fillNeon({-8}, VL / 16)); + + // 16-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.h + whilelo p1.h, xzr, x0 + + dup z1.h, #10 + dup z2.h, #2048 + + cpy z0.h, p0/m, z1.h + cpy z1.h, p0/m, z2.h + cpy z2.h, p1/m, z1.h + cpy z3.h, p1/m, z2.h + + # Test Alias + mov z4.h, p0/m, z1.h + mov z5.h, p1/m, z2.h + )"); + CHECK_NEON(0, int16_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int16_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(2, int16_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int16_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(4, int16_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(5, int16_t, fillNeon({2048}, VL / 16)); + + // 32-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.s + whilelo p1.s, xzr, x0 + + dup z1.s, #10 + dup z2.s, #2048 + + cpy z0.s, p0/m, z1.s + cpy z1.s, p0/m, z2.s + cpy z2.s, p1/m, z1.s + cpy z3.s, p1/m, z2.s + + # Test Alias + mov z4.s, p0/m, z1.s + mov z5.s, p1/m, z2.s + )"); + CHECK_NEON(0, int32_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int32_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(2, int32_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int32_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(4, int32_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(5, int32_t, fillNeon({2048}, VL / 16)); + + // 64-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.d + whilelo p1.d, xzr, x0 + + dup z1.d, #10 + dup z2.d, #2048 + + 
cpy z0.d, p0/m, z1.d + cpy z1.d, p0/m, z2.d + cpy z2.d, p1/m, z1.d + cpy z3.d, p1/m, z2.d + + # Test Alias + mov z4.d, p0/m, z1.d + mov z5.d, p1/m, z2.d + )"); + CHECK_NEON(0, int64_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int64_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(2, int64_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int64_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(4, int64_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(5, int64_t, fillNeon({2048}, VL / 16)); } TEST_P(InstSve, fcpy) { From 1873378bdae477d35a7e3f299c841ae91eedc80d Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 20 May 2024 22:53:23 +0100 Subject: [PATCH 04/38] Fixed OMP getaffinity syscall for new fix. Fixed tests for CPY_ZPmV instructions --- test/regression/aarch64/Syscall.cc | 2 +- test/regression/aarch64/instructions/sve.cc | 151 +++++++++++--------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/test/regression/aarch64/Syscall.cc b/test/regression/aarch64/Syscall.cc index 0866c278e2..c7c19eb9a2 100644 --- a/test/regression/aarch64/Syscall.cc +++ b/test/regression/aarch64/Syscall.cc @@ -1080,7 +1080,7 @@ TEST_P(Syscall, sched_getaffinity) { )"); EXPECT_EQ(getGeneralRegister(21), -1); EXPECT_EQ(getGeneralRegister(22), -1); - EXPECT_EQ(getGeneralRegister(23), 1); + EXPECT_EQ(getGeneralRegister(23), 8); } // TODO: write tgkill test diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index e75b1c2061..f7d4d445e6 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1546,33 +1546,39 @@ TEST_P(InstSve, cpy) { fillNeon({static_cast(-2048)}, VL / 16)); // SIMD & FP scalar + // Tests are different for 8/16 bit vs 32/64 bit due to the lack of fmov + // support for h and b registers // 8-bit RUN_AARCH64(R"( - mov x0, #0 - mov x1, #2 - addvl x0, x0, #1 - sdiv x0, x0, x1 - - ptrue p0.b - whilelo p1.b, xzr, x0 - - dup z1.b, #10 - dup z2.b, #-8 - - cpy z0.b, p0/m, z1.b - cpy z1.b, p0/m, z2.b - cpy z2.b, p1/m, z1.b - cpy z3.b, p1/m, z2.b - - # Test Alias - mov z4.b, p0/m, z1.b - mov z5.b, p1/m, z2.b - )"); + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.b + whilelo p1.b, xzr, x0 + + cpy z6.b, p0/z, #10 + cpy z7.b, p0/z, #-8 + cpy z8.b, p0/z, #12 + cpy z9.b, p0/z, #-16 + cpy z10.b, p0/z, #12 + cpy z11.b, p0/z, #-8 + + cpy z0.b, p0/m, b6 + cpy z1.b, p0/m, b7 + cpy z2.b, p1/m, b8 + cpy z3.b, p1/m, b9 + + # Test Alias + mov z4.b, p0/m, b10 + mov z5.b, p1/m, b11 + )"); CHECK_NEON(0, int8_t, fillNeon({10}, VL / 8)); CHECK_NEON(1, int8_t, fillNeon({-8}, VL / 8)); - CHECK_NEON(2, int8_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int8_t, fillNeon({-8}, VL / 16)); - CHECK_NEON(4, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(2, int8_t, fillNeon({12}, VL / 16)); + CHECK_NEON(3, int8_t, fillNeon({-16}, VL / 16)); + CHECK_NEON(4, int8_t, fillNeon({12}, VL / 8)); CHECK_NEON(5, int8_t, fillNeon({-8}, VL / 16)); // 16-bit @@ -1585,24 +1591,31 @@ TEST_P(InstSve, cpy) { ptrue p0.h whilelo p1.h, xzr, x0 - dup z1.h, #10 - dup z2.h, #2048 + cpy z6.h, p0/z, #10 + cpy z7.h, p0/z, #8, lsl #8 + cpy z8.h, p0/z, #-12 + cpy z9.h, p0/z, #-16, lsl #8 + cpy z10.h, p0/z, #12 + cpy z11.h, p0/z, #-8, lsl #8 - cpy z0.h, p0/m, z1.h - cpy z1.h, p0/m, z2.h - cpy z2.h, p1/m, z1.h - cpy z3.h, p1/m, z2.h + cpy z0.h, p0/m, h6 + cpy z1.h, p0/m, h7 + cpy z2.h, p1/m, h8 + cpy z3.h, p1/m, h9 # Test Alias - mov z4.h, p0/m, z1.h - mov z5.h, p1/m, z2.h + mov z4.h, p0/m, h10 + mov z5.h, p1/m, h11 )"); CHECK_NEON(0, 
int16_t, fillNeon({10}, VL / 8)); - CHECK_NEON(1, int16_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(2, int16_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int16_t, fillNeon({2048}, VL / 16)); - CHECK_NEON(4, int16_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(5, int16_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(1, int16_t, + fillNeon({static_cast(2048)}, VL / 8)); + CHECK_NEON(2, int16_t, fillNeon({-12}, VL / 16)); + CHECK_NEON(3, int16_t, + fillNeon({static_cast(-4096)}, VL / 16)); + CHECK_NEON(4, int16_t, fillNeon({12}, VL / 8)); + CHECK_NEON(5, int16_t, + fillNeon({static_cast(-2048)}, VL / 16)); // 32-bit RUN_AARCH64(R"( @@ -1614,24 +1627,28 @@ TEST_P(InstSve, cpy) { ptrue p0.s whilelo p1.s, xzr, x0 - dup z1.s, #10 - dup z2.s, #2048 + fmov s6, #10 + fmov s7, #-8 + fmov s8, #12 + fmov s9, #-16 + fmov s10, #12 + fmov s11, #-8 - cpy z0.s, p0/m, z1.s - cpy z1.s, p0/m, z2.s - cpy z2.s, p1/m, z1.s - cpy z3.s, p1/m, z2.s + cpy z0.s, p0/m, s6 + cpy z1.s, p0/m, s7 + cpy z2.s, p1/m, s8 + cpy z3.s, p1/m, s9 # Test Alias - mov z4.s, p0/m, z1.s - mov z5.s, p1/m, z2.s + mov z4.S, p0/m, s10 + mov z5.S, p1/m, s11 )"); - CHECK_NEON(0, int32_t, fillNeon({10}, VL / 8)); - CHECK_NEON(1, int32_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(2, int32_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int32_t, fillNeon({2048}, VL / 16)); - CHECK_NEON(4, int32_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(5, int32_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(0, float, fillNeon({10}, VL / 8)); + CHECK_NEON(1, float, fillNeon({static_cast(-8)}, VL / 8)); + CHECK_NEON(2, float, fillNeon({12}, VL / 16)); + CHECK_NEON(3, float, fillNeon({static_cast(-16)}, VL / 16)); + CHECK_NEON(4, float, fillNeon({12}, VL / 8)); + CHECK_NEON(5, float, fillNeon({static_cast(-8)}, VL / 16)); // 64-bit RUN_AARCH64(R"( @@ -1643,24 +1660,28 @@ TEST_P(InstSve, cpy) { ptrue p0.d whilelo p1.d, xzr, x0 - dup z1.d, #10 - dup z2.d, #2048 + fmov d6, #10 + fmov d7, #-8 + fmov d8, #12 + fmov d9, #-16 + fmov d10, #12 + fmov d11, #-8 - cpy z0.d, p0/m, z1.d - cpy z1.d, p0/m, z2.d - cpy z2.d, p1/m, z1.d - cpy z3.d, p1/m, z2.d + cpy z0.d, p0/m, d6 + cpy z1.d, p0/m, d7 + cpy z2.d, p1/m, d8 + cpy z3.d, p1/m, d9 # Test Alias - mov z4.d, p0/m, z1.d - mov z5.d, p1/m, z2.d - )"); - CHECK_NEON(0, int64_t, fillNeon({10}, VL / 8)); - CHECK_NEON(1, int64_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(2, int64_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int64_t, fillNeon({2048}, VL / 16)); - CHECK_NEON(4, int64_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(5, int64_t, fillNeon({2048}, VL / 16)); + mov z4.d, p0/m, d10 + mov z5.d, p1/m, d11 + )"); + CHECK_NEON(0, double, fillNeon({10}, VL / 8)); + CHECK_NEON(1, double, fillNeon({static_cast(-8)}, VL / 8)); + CHECK_NEON(2, double, fillNeon({12}, VL / 16)); + CHECK_NEON(3, double, fillNeon({static_cast(-16)}, VL / 16)); + CHECK_NEON(4, double, fillNeon({12}, VL / 8)); + CHECK_NEON(5, double, fillNeon({static_cast(-8)}, VL / 16)); } TEST_P(InstSve, fcpy) { From 351832716060a8da59e70863ae7a5d6479997a4a Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Tue, 21 May 2024 16:54:31 +0100 Subject: [PATCH 05/38] Added more instructions so stream+sve compiles with armclang23. Some instructions/helpers from neoverse-v2 branch. 
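This patch also folds the whilelo helper into a single sveWhile routine that
takes the lane comparison as a parameter, so whilelo/whilels/whilelt can share
one implementation. A rough standalone sketch of that idea (illustrative only;
the names and signatures below are not the SimEng API):

#include <array>
#include <cstdint>
#include <functional>

// Build a predicate whose lane i is active while cmp(n + i, m) holds.
// P is the element width the predicate governs (uint8_t ... uint64_t).
template <typename T, typename P>
std::array<uint64_t, 4> whilePred(T n, T m, unsigned vlBits,
                                  std::function<bool(T, T)> cmp) {
  std::array<uint64_t, 4> out = {0, 0, 0, 0};
  const unsigned elems = vlBits / (sizeof(P) * 8);
  for (unsigned i = 0; i < elems; i++) {
    if (cmp(static_cast<T>(n + i), m))
      out[i / (64 / sizeof(P))] |=
          1ull << ((i % (64 / sizeof(P))) * sizeof(P));
  }
  return out;
}

int main() {
  // whilelo with n=0, m=4 over 32-bit lanes at VL=128 activates all four
  // lanes, i.e. predicate word 0 becomes 0x1111 in the bit-per-byte layout.
  auto p = whilePred<uint32_t, uint32_t>(
      0, 4, 128, [](uint32_t a, uint32_t b) { return a < b; });
  return p[0] == 0x1111 ? 0 : 1;
}

Passing the comparator keeps the predicate and NZCV bookkeeping in one place,
while each opcode case only has to supply the ordering.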
--- .../simeng/arch/aarch64/helpers/sve.hh | 91 ++++++++-- src/lib/arch/aarch64/Instruction_execute.cc | 170 +++++++++++++++--- 2 files changed, 227 insertions(+), 34 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 4c162bcddf..27cb63d3ae 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -962,6 +962,34 @@ RegisterValue sveIndex( return {out, 256}; } +/** Helper function for SVE instructions with the format `lastb vd, pg, zn`. + * T represents the vector register type (e.g. zd.d would be uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveLastBScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out; + + // Get last active element + int lastElem = 0; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + // If no active lane has been found, select highest element instead + if (i == 0) lastElem = partition_num - 1; + } + + out = n[lastElem]; + return {out, 256}; +} + /** Helper function for SVE instructions with the format ` * pd, pg/z, pn, pm`. * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). @@ -1451,6 +1479,51 @@ RegisterValue sveSminv(srcValContainer& sourceValues, const uint16_t VL_bits) { return {out, 256}; } +/** Helper function for SVE instructions with the format `splice zd, pg, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSplice(srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + // Get last active element + int lastElem = 0; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + + // Extract region from n as denoted by predicate p. Copy region into the + // lowest elements of the destination operand + bool active = false; + int index = 0; + for (int i = 0; i <= lastElem; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) active = true; + if (active) { + out[index] = n[i]; + index++; + } + } + + // Set any unassigned elements to the lowest elements in m + int elemsLeft = partition_num - index; + for (int i = 0; i < elemsLeft; i++) { + out[index] = m[i]; + index++; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `Sub zd, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). @@ -1660,33 +1733,31 @@ RegisterValue sveUzp_vecs(srcValContainer& sourceValues, const uint16_t VL_bits, return {out, 256}; } -/** Helper function for SVE instructions with the format `whilelo pd, - * n, m`. +/** Helper function for SVE instructions with the format `while pd, n, m`. * T represents the type of sourceValues n and m (e.g. 
for wn, T = uint32_t). * P represents the type of operand p (e.g. for pd.b, P = uint8_t). * Returns tuple of type [pred results (array of 4 uint64_t), nzcv]. */ template -std::tuple, uint8_t> sveWhilelo( - srcValContainer& sourceValues, const uint16_t VL_bits, bool calcNZCV) { +std::tuple, uint8_t> sveWhile( + srcValContainer& sourceValues, const uint16_t VL_bits, + std::function func) { const T n = sourceValues[0].get(); const T m = sourceValues[1].get(); const uint16_t partition_num = VL_bits / (sizeof(P) * 8); std::array out = {0, 0, 0, 0}; - uint16_t index = 0; for (int i = 0; i < partition_num; i++) { // Determine whether lane should be active and shift to align with // element in predicate register. uint64_t shifted_active = - (n + i) < m ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; - out[index / (64 / (sizeof(P)))] = - out[index / (64 / (sizeof(P)))] | shifted_active; - index++; + func((n + i), m) ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; + out[i / (64 / (sizeof(P)))] |= shifted_active; } // Byte count = sizeof(P) as destination predicate is predicate of P // bytes. - uint8_t nzcv = calcNZCV ? getNZCVfromPred(out, VL_bits, sizeof(P)) : 0; + uint8_t nzcv = getNZCVfromPred(out, VL_bits, sizeof(P)); return {out, nzcv}; } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a7f8da3e86..e2ac80dab8 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2573,6 +2573,22 @@ void Instruction::execute() { vecInsIndex_gpr(sourceValues_, metadata_); break; } + case Opcode::AArch64_LASTB_VPZ_D: { // lastb dd, pg, zn.d + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_S: { // lastb sd, pg, zn.s + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_H: { // lastb hd, pg, zn.h + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_B: { // lastb bd, pg, zn.b + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD @@ -4327,34 +4343,88 @@ void Instruction::execute() { results_[0] = maddl_4ops(sourceValues_); break; } + case Opcode::AArch64_SMAX_ZI_D: { // smax zdn.d, zdn.d, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_SMAX_ZI_S: { // smax zdn.s, zdn.s, #imm results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_SMAX_ZI_H: { // smax zdn.h, zdn.h, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZI_B: { // smax zdn.b, zdn.b, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZPmZ_D: { // smax zd.d, pg/m, zn.d, zm.d + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMAX_ZPmZ_S: { // smax zd.s, pg/m, zn.s, zm.s results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_SMAX_ZPmZ_H: { // smax zd.h, pg/m, zn.h, zm.h + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZPmZ_B: { // smax zd.b, pg/m, zn.b, zm.b + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMAXv4i32: { // smax vd.4s, vn.4s, vm.4s results_[0] = 
vecLogicOp_3vecs( sourceValues_, [](int32_t x, int32_t y) -> int32_t { return std::max(x, y); }); break; } + case Opcode::AArch64_SMINV_VPZ_D: { // sminv sd, pg, zn.d + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMINV_VPZ_S: { // sminv sd, pg, zn.s results_[0] = sveSminv(sourceValues_, VL_bits); break; } + case Opcode::AArch64_SMINV_VPZ_H: { // sminv sd, pg, zn.h + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SMINV_VPZ_B: { // sminv sd, pg, zn.b + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMINVv4i32v: { // sminv sd, vn.4s results_[0] = vecMinv_2ops(sourceValues_); break; } + case Opcode::AArch64_SMIN_ZPmZ_D: { // smin zd.d, pg/m, zn.d, zm.d + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> int64_t { return std::min(x, y); }); + break; + } case Opcode::AArch64_SMIN_ZPmZ_S: { // smin zd.s, pg/m, zn.s, zm.s results_[0] = sveLogicOpPredicated_3vecs( sourceValues_, VL_bits, [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } + case Opcode::AArch64_SMIN_ZPmZ_H: { // smin zd.h, pg/m, zn.h, zm.h + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int16_t x, int16_t y) -> int16_t { return std::min(x, y); }); + break; + } + case Opcode::AArch64_SMIN_ZPmZ_B: { // smin zd.b, pg/m, zn.b, zm.b + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int8_t x, int8_t y) -> int8_t { return std::min(x, y); }); + break; + } case Opcode::AArch64_SMINv4i32: { // smin vd.4s, vn.4s, vm.4s results_[0] = vecLogicOp_3vecs( sourceValues_, @@ -4386,6 +4456,14 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_SPLICE_ZPZ_D: { // splice zdn.d, pv, zdn.t, zm.d + results_[0] = sveSplice(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SPLICE_ZPZ_S: { // splice zdn.s, pv, zdn.t, zm.s + results_[0] = sveSplice(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SSHLLv2i32_shift: { // sshll vd.2d, vn.2s, #imm results_[0] = vecShllShift_vecImm( sourceValues_, metadata_, false); @@ -5754,85 +5832,129 @@ void Instruction::execute() { break; } case Opcode::AArch64_WHILELO_PWW_B: { // whilelo pd.b, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_D: { // whilelo pd.d, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_H: { // whilelo pd.h, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_S: { // whilelo pd.s, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_B: { // whilelo pd.b, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + 
auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_D: { // whilelo pd.d, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_H: { // whilelo pd.h, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_S: { // whilelo pd.s, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_B: { // whilels pd.b, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_D: { // whilels pd.d, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_H: { // whilels pd.h, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_S: { // whilels pd.s, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_B: { // whilelt pd.b, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_D: { // whilelt pd.d, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_H: { // whilelt pd.h, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_S: { // whilelt pd.s, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; From 81889ab63429296234f3a3a604b58d4d722af7a1 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Wed, 22 May 2024 19:24:58 +0100 Subject: [PATCH 06/38] Added a couple more instructions, working towards minibude armclang23 --- .../simeng/arch/aarch64/helpers/sve.hh | 22 ++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 36 
+++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 27cb63d3ae..fd9047f635 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1347,6 +1347,28 @@ std::array svePsel( return out; } +/** Helper function for SVE instructions with the format `pfirst pdn, pg, pdn`. + * Returns an array of 4 uint64_t elements. */ +std::array svePfirst(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / 8; + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); + // Set destination d as source n to copy all false lanes and the active lanes + // beyond the first + std::array out = {dn[0], dn[1], dn[2], dn[3]}; + + // Get the first active lane and set same lane in destination predicate + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64))); + if (p[i / 64] & shifted_active) { + out[i / 64] |= shifted_active; + break; + } + } + return out; +} + /** Helper function for SVE instructions with the format `ptrue pd{, * pattern}. * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e2ac80dab8..e93ed70fce 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -833,6 +833,38 @@ void Instruction::execute() { results_[1] = output; break; } + case Opcode::AArch64_CMPHS_PPzZZ_B: { // cmphs pd.b, pg/z, zn.b, zm.b + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint8_t x, uint8_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_D: { // cmphs pd.d, pg/z, zn.d, zm.d + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint64_t x, uint64_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_H: { // cmphs pd.h, pg/z, zn.h, zm.h + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint16_t x, uint16_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_S: { // cmphs pd.s, pg/z, zn.s, zm.s + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint32_t x, uint32_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } case Opcode::AArch64_CMPNE_PPzZI_B: { // cmpne pd.b, pg/z. zn.b, #imm auto [output, nzcv] = sveCmpPredicated_toPred( sourceValues_, metadata_, VL_bits, true, @@ -4055,6 +4087,10 @@ void Instruction::execute() { results_[0] = out; break; } + case Opcode::AArch64_PFIRST_B: { // pfirst pdn.b, pg, pdn.b + results_[0] = svePfirst(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] break; } From c6c600018add51796b5c7a5bf601862e4686654b Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 30 May 2024 14:26:54 +0100 Subject: [PATCH 07/38] Added ClastB instructions with tests that (finally) pass. 
More tests to come --- .../simeng/arch/aarch64/helpers/sve.hh | 31 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 ++++++++++ test/regression/aarch64/instructions/sve.cc | 30 ++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index fd9047f635..b1871f646f 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -990,6 +990,37 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `clastb vd, pg, vd, + * zn`. T represents the vector register type (e.g. zd.d would be uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveCLastBScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[1].getAsVector(); + const uint64_t* m = sourceValues[2].getAsVector(); + const T* n = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out; + + // Get last active element + int lastElem = -1; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + + if (lastElem < 0) { + out = static_cast(static_cast(m[0])); + } else { + out = static_cast(static_cast(n[lastElem])); + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format ` * pd, pg/z, pn, pm`. * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e93ed70fce..2f16247aa1 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2621,6 +2621,22 @@ void Instruction::execute() { results_[0] = sveLastBScalar(sourceValues_, VL_bits); break; } + case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_S: { // clastb sd, pg, sn, zn.s + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_H: { // clastb hd, pg, hn, zn.h + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_B: { // clastb bd, pg, bn, zn.b + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f7d4d445e6..dbb0961f96 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6302,6 +6302,36 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } +TEST_P(InstSve, clastb) { + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z0.d, x0 + dup z1.d, x1 + + pfalse p0.b + clastb d0, p0, d0, z1.d + mov z4.d, z0.d + + ptrue p0.d + clastb d0, p0, d0, z1.d + mov z5.d, z0.d + )"); + // EXPECT_EQ(getGeneralRegister(0), (0x0123456789ABCDEF)); + CHECK_NEON(4, uint64_t, + 
fillNeon({0x0123456789ABCDEF}, 8)); // False + + CHECK_NEON(5, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); // + // True +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 240ae68113ea24e082af53d80c5368357ecce39d Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 30 May 2024 15:16:49 +0100 Subject: [PATCH 08/38] Cleaned up clastb tests and added S,H,B cases --- test/regression/aarch64/instructions/sve.cc | 94 ++++++++++++++++++--- 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index dbb0961f96..417a73e436 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6303,6 +6303,7 @@ TEST_P(InstSve, smulh) { } TEST_P(InstSve, clastb) { + // 64 bit RUN_AARCH64(R"( movz x0, #0xCDEF movk x0, #0x89AB, LSL #16 @@ -6313,23 +6314,94 @@ TEST_P(InstSve, clastb) { movk x1, #0xCBA9, LSL #32 movk x1, #0x1FED, LSL #48 - dup z0.d, x0 - dup z1.d, x1 + dup z2.d, x0 + dup z3.d, x1 pfalse p0.b - clastb d0, p0, d0, z1.d - mov z4.d, z0.d + clastb d2, p0, d2, z3.d + mov z0.d, z2.d ptrue p0.d - clastb d0, p0, d0, z1.d - mov z5.d, z0.d + clastb d2, p0, d2, z3.d + mov z1.d, z2.d )"); - // EXPECT_EQ(getGeneralRegister(0), (0x0123456789ABCDEF)); - CHECK_NEON(4, uint64_t, - fillNeon({0x0123456789ABCDEF}, 8)); // False + CHECK_NEON(0, uint64_t, fillNeon({0x0123456789ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); + + // 32 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 - CHECK_NEON(5, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); // - // True + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb s2, p0, s2, z3.s + mov z0.d, z2.d + + ptrue p0.s + clastb s2, p0, s2, z3.s + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x89ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA9}, 8)); + + // 16 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb h2, p0, h2, z3.h + mov z0.d, z2.d + + ptrue p0.h + clastb h2, p0, h2, z3.h + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FED}, 8)); + + // 8 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb b2, p0, b2, z3.b + mov z0.d, z2.d + + ptrue p0.b + clastb b2, p0, b2, z3.b + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); } TEST_P(InstSve, st1b) { From 5e798500400a52424a204c18a4b1203704f6adbe Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 30 May 2024 18:22:47 +0100 Subject: [PATCH 09/38] Dirty WIP for pnext instruction --- .../simeng/arch/aarch64/helpers/sve.hh | 45 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 +++++++ 2 files changed, 61 insertions(+) diff --git 
a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index b1871f646f..bc53e4bcff 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1400,6 +1400,51 @@ std::array svePfirst(srcValContainer& sourceValues, return out; } +/** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. + * Returns an array of 4 uint64_t elements. */ +template +std::array svePnext( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); + std::array out = {dn[0], dn[1], dn[2], dn[3]}; + + // Get pattern + const uint16_t count = + sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); + // Exit early if count == 0 + if (count == 0) return out; + + // Create mask so we can zero the pattern + uint64_t mask = ~((1ULL << (64 - count * 8)) - 1); + out[0] &= mask; + + // Get last active element of dn.pattern + int lastElem = -1; + for (int i = partition_num - 1; i >= 0; i--) { + if (i < count) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (dn[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + } + // Get next active element of p, starting from last of dn.pattern + for (int i = lastElem + 1; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i / (64 / sizeof(T))] |= shifted_active; + break; + } + } + + return out; +} + /** Helper function for SVE instructions with the format `ptrue pd{, * pattern}. * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). 
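// Standalone sketch (not SimEng code) of the PNEXT behaviour implemented by
// svePnext above: find the highest active element of pdn, then activate only
// the next element that is active in the governing predicate pv.  The real
// helper also honours the operand's pattern count (and, in a later commit,
// sets NZCV); both are omitted here.  All names below are hypothetical.
#include <array>
#include <cstdint>

template <typename T>
std::array<uint64_t, 4> pnextRef(const std::array<uint64_t, 4>& pv,
                                 const std::array<uint64_t, 4>& pdn,
                                 unsigned vlBits) {
  std::array<uint64_t, 4> out = {0, 0, 0, 0};
  const int elems = vlBits / (sizeof(T) * 8);
  auto bit = [](int i) {
    return 1ull << ((i % (64 / static_cast<int>(sizeof(T)))) * sizeof(T));
  };
  // Highest active element of the source/destination predicate, -1 if none.
  int last = -1;
  for (int i = elems - 1; i >= 0; i--)
    if (pdn[i / (64 / sizeof(T))] & bit(i)) { last = i; break; }
  // First element after it that is active in the governing predicate.
  for (int i = last + 1; i < elems; i++)
    if (pv[i / (64 / sizeof(T))] & bit(i)) {
      out[i / (64 / sizeof(T))] |= bit(i);
      break;
    }
  return out;
}

int main() {
  // Two 64-bit lanes (VL = 128): pdn marks lane 0, pv has both lanes active,
  // so the "next" active lane is lane 1 (bit 8 of predicate word 0).
  std::array<uint64_t, 4> pv = {0x0101, 0, 0, 0}, pdn = {0x1, 0, 0, 0};
  return pnextRef<uint64_t>(pv, pdn, 128)[0] == 0x100 ? 0 : 1;
}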
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 2f16247aa1..ad0448c8db 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4107,6 +4107,22 @@ void Instruction::execute() { results_[0] = svePfirst(sourceValues_, VL_bits); break; } + case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_PNEXT_H: { // pnext pdn.h, pv, pdn.h + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_PNEXT_S: { // pnext pdn.s, pv, pdn.s + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_PNEXT_D: { // pnext pdn.d, pv, pdn.d + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] break; } From f8ea7f29f39919e1989d4f8d2e123759a5cb03cd Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 3 Jun 2024 15:45:01 +0100 Subject: [PATCH 10/38] Added pnext inst along with tests --- .../simeng/arch/aarch64/helpers/sve.hh | 13 +-- test/regression/aarch64/instructions/sve.cc | 96 +++++++++++++++++++ 2 files changed, 101 insertions(+), 8 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index bc53e4bcff..7fa84ad4b8 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1408,20 +1408,18 @@ std::array svePnext( const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - const uint64_t* p = sourceValues[0].getAsVector(); - const uint64_t* dn = sourceValues[1].getAsVector(); - std::array out = {dn[0], dn[1], dn[2], dn[3]}; + const uint64_t* p = sourceValues[1].getAsVector(); + const uint64_t* dn = sourceValues[2].getAsVector(); + // Set destination elements to 0 + std::array out = {0, 0, 0, 0}; // Get pattern const uint16_t count = sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); + // Exit early if count == 0 if (count == 0) return out; - // Create mask so we can zero the pattern - uint64_t mask = ~((1ULL << (64 - count * 8)) - 1); - out[0] &= mask; - // Get last active element of dn.pattern int lastElem = -1; for (int i = partition_num - 1; i >= 0; i--) { @@ -1441,7 +1439,6 @@ std::array svePnext( break; } } - return out; } diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 417a73e436..29394d00a8 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5798,6 +5798,102 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, pnext) { + initialHeapData_.resize(1024); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + + // B arrangement + // Allow 32 Byte space for each predicate register for when VL=2048 + std::vector src = {0xAAAA, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xAA00, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 12); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p2, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.b, p2, p0.b + + ldr p1, [x0] + add x0, x0, #32 + ldr p3, [x0] + + pnext p1.b, p3, p1.b + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x02, 0, 0, 0}, 32)); + CHECK_PREDICATE(1, 
uint64_t, + fillPredFromSource({0x0200, 0, 0, 0}, 32)); + + // H arrangement + src = {0x5555, 0x0, 0x0, 0x0, 0x3333, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p1, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.h, p1, p0.h + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x4000, 0, 0, 0}, 32)); + + // S arrangement + src = {0x9, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p1, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.s, p1, p0.s + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x1, 0, 0, 0}, 32)); + + // D arrangement + src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, + 0xFF0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 12); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p2, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.d, p2, p0.d + + add x0, x0, #32 + ldr p3, [x0] + add x0, x0, #32 + ldr p1, [x0] + + pnext p1.d, p3, p1.d + )"); + CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0, 0, 0, 0}, 32)); + CHECK_PREDICATE(1, uint64_t, + fillPredFromSource({0x100, 0, 0, 0}, 32)); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b From 5992cd17d1a62e9ee4206964418538d97e068cd2 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 3 Jun 2024 17:00:55 +0100 Subject: [PATCH 11/38] Added NZCV changes to pnext and updated tests --- .../simeng/arch/aarch64/helpers/sve.hh | 9 ++++----- src/lib/arch/aarch64/Instruction_execute.cc | 20 +++++++++++++++---- test/regression/aarch64/instructions/sve.cc | 14 ++++++++----- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 7fa84ad4b8..fa111280bb 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1401,9 +1401,9 @@ std::array svePfirst(srcValContainer& sourceValues, } /** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. - * Returns an array of 4 uint64_t elements. */ + * Returns an array of 4 uint64_t elements, and updates the NZCV flags. 
*/ template -std::array svePnext( +std::tuple, uint8_t> svePnext( srcValContainer& sourceValues, const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { @@ -1418,8 +1418,7 @@ std::array svePnext( sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); // Exit early if count == 0 - if (count == 0) return out; - + if (count == 0) return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; // Get last active element of dn.pattern int lastElem = -1; for (int i = partition_num - 1; i >= 0; i--) { @@ -1439,7 +1438,7 @@ std::array svePnext( break; } } - return out; + return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; } /** Helper function for SVE instructions with the format `ptrue pd{, diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index ad0448c8db..bc56cb8934 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4108,19 +4108,31 @@ void Instruction::execute() { break; } case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_H: { // pnext pdn.h, pv, pdn.h - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_S: { // pnext pdn.s, pv, pdn.s - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_D: { // pnext pdn.d, pv, pdn.d - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 29394d00a8..fde761a148 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5829,9 +5829,10 @@ TEST_P(InstSve, pnext) { fillPredFromSource({0x02, 0, 0, 0}, 32)); CHECK_PREDICATE(1, uint64_t, fillPredFromSource({0x0200, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b0010); // H arrangement - src = {0x5555, 0x0, 0x0, 0x0, 0x3333, 0x0, 0x0, 0x0}; + src = {0x555, 0x0, 0x0, 0x0, 0x333, 0x0, 0x0, 0x0}; fillHeap(heap64, src, 8); RUN_AARCH64(R"( # Get heap address @@ -5846,7 +5847,8 @@ TEST_P(InstSve, pnext) { pnext p0.h, p1, p0.h )"); CHECK_PREDICATE(0, uint64_t, - fillPredFromSource({0x4000, 0, 0, 0}, 32)); + fillPredFromSource({0x400, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b0010); // S arrangement src = {0x9, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0}; @@ -5865,10 +5867,11 @@ TEST_P(InstSve, pnext) { )"); CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0x1, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b1010); // D arrangement - src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, - 0xFF0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, + 0xF3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; fillHeap(heap64, src, 12); RUN_AARCH64(R"( # Get heap address @@ -5891,7 +5894,8 @@ TEST_P(InstSve, pnext) { )"); CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0, 0, 0, 0}, 32)); CHECK_PREDICATE(1, 
uint64_t, - fillPredFromSource({0x100, 0, 0, 0}, 32)); + fillPredFromSource({0x1, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b1010); } TEST_P(InstSve, punpk) { From 49dbbbe2eed2024d68e8fdf2b53de6efddc7c615 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 13 Jun 2024 17:36:03 +0100 Subject: [PATCH 12/38] Added weird FP Trig SVE insts (untested). Minibude now works with armclang23! --- .../simeng/arch/aarch64/helpers/sve.hh | 126 ++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 ++++ 2 files changed, 150 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index fa111280bb..931ca27d42 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -875,6 +875,132 @@ RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `ftsmul zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. U represents the same precision as + * T, but as an integer type for the second source register. */ +template +RegisterValue sveFTrigSMul(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const U* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); + // Square each element in the first source vector and then set the sign bit + // to a copy of bit 0 of the corresponding element in the second source + // register + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] * n[i]; + T sign_bit = m[i] & bit_0_mask ? 1.0 : -1.0; + out[i] = std::abs(out[i]) * sign_bit; + } + + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `ftssel zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. U represents the same precision as + * T, but as an integer type for the second source register. */ +template +RegisterValue sveFTrigSSel(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const U* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); + U bit_1_mask = 1ull << (sizeof(T) * 8 - 2); + + // Place the value 1.0 or a copy of the first source vector element in the + // destination element, depending on bit 0 of the corresponding element of + // the second source vector. The sign bit of the destination element is + // copied from bit 1 of the second source vector + for (int i = 0; i < partition_num; i++) { + out[i] = m[i] & bit_0_mask ? 1.0 : n[i]; + T sign_bit = m[i] & bit_1_mask ? 1.0 : -1.0; + out[i] = std::abs(out[i]) * sign_bit; + } + + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `ftmad zd, zn, zm, + * #imm`. T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. 
**/ +template +RegisterValue sveFTrigMad( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + const uint8_t imm = static_cast(metadata.operands[1].imm); + + const std::array sin64 = {1.0, + -0.1666666666666661, + 0.8333333333320002e-02, + -0.1984126982840213e-03, + 0.2755731329901505e-05, + -0.2505070584637887e-07, + 0.1589413637195215e-09, + 0.0}; + + const std::array cos64 = {1.0, + -0.5000000000000000, + 0.4166666666666645e-01, + -0.1388888888886111e-02, + 0.2480158728388683e-04, + -0.2755731309913950e-06, + 0.2087558253975872e-08, + -0.1135338700720054e-10}; + + const std::array sin32 = {1.0, + -1.666666716337e-01, + 8.333330973983e-03, + -1.983967522392e-04, + 2.721174723774e-06, + 0.0, + 0.0, + 0.0}; + + const std::array cos32 = {1.0, + -5.000000000000e-01, + 4.166664928198e-02, + -1.388759003021e-03, + 2.446388680255e-05, + 0.0, + 0.0, + 0.0}; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + // std::array lut; + + for (int i = 0; i < partition_num; i++) { + T coeff; + const bool sign_bit = m[i] < 0 ? 1 : 0; + // If float then use those LUTs + if (sizeof(T) == 4) { + coeff = sign_bit ? cos32[imm] : sin32[imm]; + } + // Else if double use those LUTs + else { + coeff = sign_bit ? cos64[imm] : sin64[imm]; + } + // TODO: Add FP16 support if/when we eventually support these (may require + // C++23) + out[i] = n[i] * std::abs(m[i]) + coeff; + } + + return {out, 256}; +} + /** Helper function for SVE instructions with the format `inc * xdn{, pattern{, MUL #imm}}`. * T represents the type of operation (e.g. for INCB, T = int8_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index bc56cb8934..1d50f77e85 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -355,6 +355,30 @@ void Instruction::execute() { sveAdr_packedOffsets(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_FTSMUL_ZZZ_S: { // ftsmul zd.s, zn.s, zm.s + results_[0] = sveFTrigSMul(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSMUL_ZZZ_D: { // ftsmul zd.d, zn.d, zm.d + results_[0] = sveFTrigSMul(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSSEL_ZZZ_S: { // ftssel zd.s, zn.s, zm.s + results_[0] = sveFTrigSSel(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSSEL_ZZZ_D: { // ftssel zd.d, zn.d, zm.d + results_[0] = sveFTrigSSel(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTMAD_ZZI_S: { // ftmad zd.s, zn.s, zm.s, #imm + results_[0] = sveFTrigMad(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_FTMAD_ZZI_D: { // ftmad zd.s, zn.s, zm.s, #imm + results_[0] = sveFTrigMad(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_ANDSWri: { // ands wd, wn, #imm auto [result, nzcv] = logicOp_imm( sourceValues_, metadata_, true, From 2716a711e7ba913d286cec48a1f0cbc20f8bb30e Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 18 Jun 2024 18:12:59 +0100 Subject: [PATCH 13/38] Supported minisweep --- .../simeng/arch/aarch64/helpers/sve.hh | 13 +++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 22 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 931ca27d42..e7e841e82f 100644 --- 
a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -114,6 +114,19 @@ RegisterValue sveAddvPredicated(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for NEON instructions with the format `uaddlv Vd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAddlv(srcValContainer& sourceValues) { + const U* n = sourceValues[0].getAsVector(); + T out = 0; + for (int i = 0; i < I; i++) { + out += n[i]; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, * lsl #<1,2,3>}]`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1d50f77e85..d4b795f876 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -205,6 +205,10 @@ void Instruction::execute() { results_[0] = vecSumElems_2ops(sourceValues_); break; } + case Opcode::AArch64_UADDLVv8i8v: { // uaddlv hd, vn.8b + results_[0] = sveAddlv(sourceValues_); + break; + } case Opcode::AArch64_ADDWri: { // add wd, wn, #imm{, shift} auto [result, nzcv] = addShift_imm(sourceValues_, metadata_, false); @@ -699,6 +703,12 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> bool { return (x == y); }); break; } + case Opcode::AArch64_CMEQv2i32rz: { // cmeq vd.2s, vn.2s, #0 + results_[0] = vecCompare( + sourceValues_, true, + [](uint32_t x, uint32_t y) -> bool { return (x == y); }); + break; + } case Opcode::AArch64_CMEQv4i32: { // cmeq vd.4s, vn.4s, vm.4s results_[0] = vecCompare( sourceValues_, false, @@ -717,6 +727,12 @@ void Instruction::execute() { [](int8_t x, int8_t y) -> bool { return (x == y); }); break; } + case Opcode::AArch64_CMHIv2i32: { // cmhi vd.2s, vn.2s, vm.2s + results_[0] = vecCompare( + sourceValues_, false, + [](uint32_t x, uint32_t y) -> bool { return (x > y); }); + break; + } case Opcode::AArch64_CMHIv4i32: { // cmhi vd.4s, vn.4s, vm.4s results_[0] = vecCompare( sourceValues_, false, @@ -4122,6 +4138,12 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_ORNv8i8: { // orn vd.8b, vn.8b, vn.8b + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x | (~y); }); + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; From 8c56ee5d350d9c272859542b91a7179671a4e997 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 19 Jun 2024 16:41:06 +0100 Subject: [PATCH 14/38] Added instructions to support CloverLeaf armclang23. Numerical error :O --- .../simeng/arch/aarch64/helpers/sve.hh | 20 ++++++++- src/lib/arch/aarch64/Instruction_execute.cc | 20 +++++++++ test/regression/aarch64/instructions/sve.cc | 43 +++++++++++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index e7e841e82f..7bf8d2a427 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -115,8 +115,9 @@ RegisterValue sveAddvPredicated(srcValContainer& sourceValues, } /** Helper function for NEON instructions with the format `uaddlv Vd, Vn.T`. - * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). 
- * Returns correctly formatted RegisterValue. */ + * T represents the type of the destination register (e.g. for h0, T = + * uint32_t). U represents the type of the sourceValues[0] (e.g. for v0.8b, U = + * uint8_t) Returns correctly formatted RegisterValue. */ template RegisterValue sveAddlv(srcValContainer& sourceValues) { const U* n = sourceValues[0].getAsVector(); @@ -127,6 +128,21 @@ RegisterValue sveAddlv(srcValContainer& sourceValues) { return {out, 256}; } +/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUMaxV(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out = n[0]; + for (int i = 1; i < I; i++) { + std::cout << "Comparing " << n[i] << " and " << out; + out = std::max(n[i], out); + std::cout << ". " << out << " won\n"; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, * lsl #<1,2,3>}]`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d4b795f876..f65f233082 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5800,6 +5800,26 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMAXVv16i8v: { // umaxv bd, vn.16b + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv4i16v: { // umaxv hd, vn.4h + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv4i32v: { // umaxv sd, vn.4s + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv8i16v: { // umaxv hd, vn.8h + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv8i8v: { // umaxv bd, vn.8b + results_[0] = sveUMaxV(sourceValues_); + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index fde761a148..3ef1d1148a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6402,6 +6402,49 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } +TEST_P(InstSve, umaxp) { + // umaxv vd, vn.t + initialHeapData_.resize(32); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + + // v0 + heap[0] = 0x01; + heap[1] = 0x00; + heap[2] = 0xFF; + heap[3] = 0xAA; + heap[4] = 0xBB; + heap[5] = 0xCC; + heap[6] = 0xDD; + heap[7] = 0xEE; + + // v1 + heap[8] = 0x00; + heap[9] = 0x00; + heap[10] = 0xEE; + heap[11] = 0x11; + heap[12] = 0x22; + heap[13] = 0x33; + heap[14] = 0x44; + heap[15] = 0x55; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + umaxv h2, v0.4h + umaxv h3, v1.4h + + )"); + CHECK_NEON(2, uint16_t, + {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(3, uint16_t, + {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); +} + TEST_P(InstSve, clastb) { // 64 bit RUN_AARCH64(R"( From 32d0d6c8e15cc29e5d8bca62bf3b1c7aa194e607 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 19 Jun 2024 19:08:24 +0100 Subject: [PATCH 
15/38] Added a test to start investigating what's wrong with cloverleaf --- src/include/simeng/arch/aarch64/helpers/sve.hh | 2 -- test/regression/aarch64/instructions/bitmanip.cc | 6 ++++++ test/regression/aarch64/instructions/sve.cc | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 7bf8d2a427..38f21baee5 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -136,9 +136,7 @@ RegisterValue sveUMaxV(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); T out = n[0]; for (int i = 1; i < I; i++) { - std::cout << "Comparing " << n[i] << " and " << out; out = std::max(n[i], out); - std::cout << ". " << out << " won\n"; } return {out, 256}; } diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64dc..8622169db0 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -71,11 +71,17 @@ TEST_P(InstBitmanip, extr) { extr w4, w1, w2, 4 extr w5, w1, w2, 24 extr w6, w1, w2, 31 + + # Check alias + ror w7, w1, 31 + ror w8, w1, 24 )"); EXPECT_EQ(getGeneralRegister(3), 0x12345678); EXPECT_EQ(getGeneralRegister(4), 0xF1234567); EXPECT_EQ(getGeneralRegister(5), 0xADBEEF12); EXPECT_EQ(getGeneralRegister(6), 0xBD5B7DDE); + EXPECT_EQ(getGeneralRegister(7), 0xBD5B7DDF); + EXPECT_EQ(getGeneralRegister(8), 0xADBEEFDE); // 64-bit initialHeapData_.resize(16); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 3ef1d1148a..951a6f7627 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6402,7 +6402,7 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } -TEST_P(InstSve, umaxp) { +TEST_P(InstSve, umaxv) { // umaxv vd, vn.t initialHeapData_.resize(32); uint8_t* heap = reinterpret_cast(initialHeapData_.data()); From 2bb065b790aeeb4b7f59c90ff96337ce2a6b8b49 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 20 Jun 2024 15:09:42 +0100 Subject: [PATCH 16/38] Added test for LDRSWroW --- test/regression/aarch64/instructions/load.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 09269eebb8..05ffdd90a0 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -1277,17 +1277,19 @@ TEST_P(InstLoad, ldrsw) { mov x0, 0 mov x8, 214 svc #0 - mov x5, 1 + mov x6, 1 # Load 32-bit values from heap and sign-extend to 64-bits ldrsw x1, [x0, #4] ldrsw x2, [x0], #4 ldrsw x3, [x0] - ldrsw x4, [x0, x5, lsl #2] + ldrsw x4, [x0, x6, lsl #2] + ldrsw x5, [x0, w6, uxtw #2] )"); EXPECT_EQ(getGeneralRegister(1), INT32_MAX); EXPECT_EQ(getGeneralRegister(2), -2); EXPECT_EQ(getGeneralRegister(3), INT32_MAX); EXPECT_EQ(getGeneralRegister(4), -5); + EXPECT_EQ(getGeneralRegister(5), -5); // ldursw RUN_AARCH64(R"( From b40d0113237fca8920053e4bfc7d56191d6d23a8 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 22 Aug 2024 17:33:52 +0000 Subject: [PATCH 17/38] Added mechanism to detect ROB loops. 
Also added FDIVv4f32 inst --- src/include/simeng/pipeline/ReorderBuffer.hh | 6 +++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ src/lib/pipeline/ReorderBuffer.cc | 24 +++++++++++++++----- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index 2e6e68e37b..d0622c2a30 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,6 +125,12 @@ class ReorderBuffer { */ uint64_t pc_; + /** The address of the last instruction at the head of the ROB to check if it's stuck */ + uint64_t last_inst_addr = 0; + + /** A counter for how many cycles the same instruction has been at the head of the ROB */ + uint64_t inst_repeat_counter = 0; + /** The sequence ID of the youngest instruction that should remain after the * current flush. */ uint64_t flushAfter_; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index f65f233082..65541eefcd 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1840,6 +1840,10 @@ void Instruction::execute() { results_[0] = vecFDiv(sourceValues_); break; } + case Opcode::AArch64_FDIVv4f32: { // fdiv vd.4s, vn.4s, vm.4s + results_[0] = vecFDiv(sourceValues_); + break; + } case Opcode::AArch64_FDUP_ZI_D: { // fdup zd.d, #imm results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, VL_bits, true); diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index e72e6e79dc..e2ce8ebc63 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -36,20 +36,18 @@ void ReorderBuffer::reserve(const std::shared_ptr& insn) { void ReorderBuffer::commitMicroOps(uint64_t insnId) { if (buffer_.size()) { size_t index = 0; - uint64_t firstOp = UINT64_MAX; + int64_t firstOp = -1; bool validForCommit = false; - bool foundFirstInstance = false; // Find first instance of uop belonging to macro-op instruction for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() == insnId) { firstOp = index; - foundFirstInstance = true; break; } } - if (foundFirstInstance) { + if (firstOp > -1) { // If found, see if all uops are committable for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() != insnId) break; @@ -62,7 +60,7 @@ void ReorderBuffer::commitMicroOps(uint64_t insnId) { } if (!validForCommit) return; - assert(firstOp != UINT64_MAX && "firstOp hasn't been populated"); + assert(firstOp > -1 && "firstOp hasn't been populated"); // No early return thus all uops are committable for (; firstOp < buffer_.size(); firstOp++) { if (buffer_[firstOp]->getInstructionId() != insnId) break; @@ -81,6 +79,19 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { unsigned int n; for (n = 0; n < maxCommits; n++) { auto& uop = buffer_[0]; + if (uop->getInstructionAddress() == last_inst_addr) { + inst_repeat_counter++; + } else { + inst_repeat_counter = 0; + } + if (inst_repeat_counter > 10000000) { + std::cout << "Infinite loop detected in rob commit at instruction address " + << std::hex << uop->getInstructionAddress() << std::dec << " (" + << uop->getMicroOpIndex() << "). 
Killing.\n"; + exit(1); + } + last_inst_addr = uop->getInstructionAddress(); + if (!uop->canCommit()) { break; } @@ -97,7 +108,7 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { for (size_t i = 0; i < destinations.size(); i++) { rat_.commit(destinations[i]); } - + // If it's a memory op, commit the entry at the head of the respective queue if (uop->isLoad()) { lsq_.commitLoad(uop); @@ -227,3 +238,4 @@ uint64_t ReorderBuffer::getRetiredBranchesCount() const { } } // namespace pipeline } // namespace simeng + From 14cc2e1ab31dadde8f46390363dab3575077d75b Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 28 Aug 2024 18:37:58 +0100 Subject: [PATCH 18/38] Clang format --- src/include/simeng/pipeline/ReorderBuffer.hh | 6 ++++-- src/lib/pipeline/ReorderBuffer.cc | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index d0622c2a30..06a9aefadd 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,10 +125,12 @@ class ReorderBuffer { */ uint64_t pc_; - /** The address of the last instruction at the head of the ROB to check if it's stuck */ + /** The address of the last instruction at the head of the ROB to check if + * it's stuck */ uint64_t last_inst_addr = 0; - /** A counter for how many cycles the same instruction has been at the head of the ROB */ + /** A counter for how many cycles the same instruction has been at the head of + * the ROB */ uint64_t inst_repeat_counter = 0; /** The sequence ID of the youngest instruction that should remain after the diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index e2ce8ebc63..4887a69ad2 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -85,9 +85,10 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { inst_repeat_counter = 0; } if (inst_repeat_counter > 10000000) { - std::cout << "Infinite loop detected in rob commit at instruction address " - << std::hex << uop->getInstructionAddress() << std::dec << " (" - << uop->getMicroOpIndex() << "). Killing.\n"; + std::cout + << "Infinite loop detected in rob commit at instruction address " + << std::hex << uop->getInstructionAddress() << std::dec << " (" + << uop->getMicroOpIndex() << "). 
Killing.\n"; exit(1); } last_inst_addr = uop->getInstructionAddress(); @@ -108,7 +109,7 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { for (size_t i = 0; i < destinations.size(); i++) { rat_.commit(destinations[i]); } - + // If it's a memory op, commit the entry at the head of the respective queue if (uop->isLoad()) { lsq_.commitLoad(uop); @@ -238,4 +239,3 @@ uint64_t ReorderBuffer::getRetiredBranchesCount() const { } } // namespace pipeline } // namespace simeng - From bd3bfc8b1aa893ad4fadc6dd7b7ba12610a90575 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 29 Aug 2024 17:37:49 +0100 Subject: [PATCH 19/38] Fixed a couple build issues/warnings --- src/include/simeng/arch/aarch64/helpers/sve.hh | 6 +++--- src/lib/pipeline/ReorderBuffer.cc | 6 ++++-- test/regression/aarch64/AArch64RegressionTest.hh | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 38f21baee5..27e9ed7753 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -915,7 +915,7 @@ RegisterValue sveFTrigSMul(srcValContainer& sourceValues, const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; - U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); + U bit_0_mask = static_cast(1) << (sizeof(T) * 8 - 1); // Square each element in the first source vector and then set the sign bit // to a copy of bit 0 of the corresponding element in the second source // register @@ -941,8 +941,8 @@ RegisterValue sveFTrigSSel(srcValContainer& sourceValues, const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; - U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); - U bit_1_mask = 1ull << (sizeof(T) * 8 - 2); + U bit_0_mask = static_cast(1) << (sizeof(T) * 8 - 1); + U bit_1_mask = static_cast(1) << (sizeof(T) * 8 - 2); // Place the value 1.0 or a copy of the first source vector element in the // destination element, depending on bit 0 of the corresponding element of diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index 4887a69ad2..f70b50b8bc 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -36,18 +36,20 @@ void ReorderBuffer::reserve(const std::shared_ptr& insn) { void ReorderBuffer::commitMicroOps(uint64_t insnId) { if (buffer_.size()) { size_t index = 0; - int64_t firstOp = -1; + uint64_t firstOp = UINT64_MAX; bool validForCommit = false; + bool foundFirstInstance = false; // Find first instance of uop belonging to macro-op instruction for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() == insnId) { firstOp = index; + foundFirstInstance = true; break; } } - if (firstOp > -1) { + if (foundFirstInstance) { // If found, see if all uops are committable for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() != insnId) break; diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09d..3b2490666d 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -503,7 +503,7 @@ class AArch64RegressionTest : public RegressionTest { std::array generatedArray; generatedArray.fill(0); // Fill array by cycling through source elements - for (int i = 0; i < (num_bytes / sizeof(T)); i++) { + for (size_t i = 0; i < (num_bytes / sizeof(T)); i++) { generatedArray[i] = 
src[i % src.size()]; } return generatedArray; From 3e45c868a32992b7f5aa0fe54d81ca3bacd0bb01 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 30 Aug 2024 13:45:31 +0100 Subject: [PATCH 20/38] Added uaddlv test, as well as rolled back a ROB fix --- src/lib/pipeline/ReorderBuffer.cc | 2 +- test/regression/aarch64/instructions/neon.cc | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index f70b50b8bc..1ff4a6b6c5 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -62,7 +62,7 @@ void ReorderBuffer::commitMicroOps(uint64_t insnId) { } if (!validForCommit) return; - assert(firstOp > -1 && "firstOp hasn't been populated"); + assert(firstOp != UINT64_MAX && "firstOp hasn't been populated"); // No early return thus all uops are committable for (; firstOp < buffer_.size(); firstOp++) { if (buffer_[firstOp]->getInstructionId() != insnId) break; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..fdf7405c86 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -356,6 +356,26 @@ TEST_P(InstNeon, addv) { CHECK_NEON(1, uint8_t, {40}); } +TEST_P(InstNeon, uaddlv) { + // 16-bit + initialHeapData_.resize(16); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 16; i++) { + heap8[i] = (i + 1); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + uaddlv h1, v0.8b + )"); + CHECK_NEON(1, uint16_t, {36}); +} + TEST_P(InstNeon, and) { initialHeapData_.resize(32); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); From 96034a525f77eece4a7053fda21b42dcf3c881ac Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 30 Aug 2024 17:30:36 +0100 Subject: [PATCH 21/38] Added tests for cmphs and a couple other insts. 
Fixed a couple bugs to do with cmphs --- src/lib/arch/aarch64/Instruction_execute.cc | 8 +- test/regression/aarch64/instructions/neon.cc | 97 ++++++++-- test/regression/aarch64/instructions/sve.cc | 186 +++++++++++++++++++ 3 files changed, 268 insertions(+), 23 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 65541eefcd..fecf3a36ae 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -875,7 +875,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_B: { // cmphs pd.b, pg/z, zn.b, zm.b auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint8_t x, uint8_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; @@ -883,7 +883,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_D: { // cmphs pd.d, pg/z, zn.d, zm.d auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint64_t x, uint64_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; @@ -891,7 +891,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_H: { // cmphs pd.h, pg/z, zn.h, zm.h auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint16_t x, uint16_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; @@ -899,7 +899,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_S: { // cmphs pd.s, pg/z, zn.s, zm.s auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint32_t x, uint32_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index fdf7405c86..320014d0cb 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -711,18 +711,53 @@ TEST_P(InstNeon, cmeq) { CHECK_NEON(2, uint8_t, {0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, {0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00}); - // 32-bit + // 32-bit, 2 lane initialHeapData_.resize(128); - uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - heap32[0] = 10; - heap32[1] = 11; - heap32[2] = 12; - heap32[3] = 13; + uint32_t* heapv2i32 = reinterpret_cast(initialHeapData_.data()); + heapv2i32[0] = 10; + heapv2i32[1] = 0; + + heapv2i32[2] = 0; + heapv2i32[3] = 12; + + heapv2i32[4] = 15; + heapv2i32[5] = 9; + + heapv2i32[6] = 0; + heapv2i32[7] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + ldr q2, [x0, #16] + ldr q3, [x0, #24] + cmeq v4.2s, v0.2s, #0 + cmeq v5.2s, v1.2s, #0 + cmeq v6.2s, v2.2s, #0 + cmeq v7.2s, v3.2s, #0 + )"); + CHECK_NEON(4, uint32_t, {0, 0xFFFFFFFFu}); + CHECK_NEON(5, uint32_t, {0xFFFFFFFFu, 0}); + CHECK_NEON(6, uint32_t, {0, 0}); + CHECK_NEON(7, uint32_t, {0xFFFFFFFFu, 0xFFFFFFFFu}); + + // 32-bit, 4 lane + initialHeapData_.resize(128); + uint32_t* heapv4i32 = reinterpret_cast(initialHeapData_.data()); + heapv4i32[0] = 10; + heapv4i32[1] = 11; + heapv4i32[2] = 12; + heapv4i32[3] = 13; - heap32[4] = 13; - heap32[5] = 11; - heap32[6] = 12; - heap32[7] = 10; + heapv4i32[4] = 13; + heapv4i32[5] = 11; 
+ heapv4i32[6] = 12; + heapv4i32[7] = 10; RUN_AARCH64(R"( # Get heap address @@ -800,16 +835,40 @@ TEST_P(InstNeon, cmhs) { } TEST_P(InstNeon, cmhi) { + // 32-bit, 2 lane initialHeapData_.resize(32); - uint32_t* heap = reinterpret_cast(initialHeapData_.data()); - heap[0] = 42; - heap[1] = 7; - heap[2] = UINT32_MAX; - heap[3] = 7; - heap[4] = 1; - heap[5] = (1u << 31) - 1; - heap[6] = 0; - heap[7] = 7; + uint32_t* heapv2i32 = reinterpret_cast(initialHeapData_.data()); + heapv2i32[0] = UINT32_MAX; + heapv2i32[1] = 7; + + heapv2i32[2] = 1; + heapv2i32[3] = 7; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + cmhi v2.2s, v0.2s, v1.2s + cmhi v3.2s, v1.2s, v0.2s + )"); + CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0}); + CHECK_NEON(3, uint32_t, {0x0, 0x0}); + + // 32-bit, 4 lane + initialHeapData_.resize(32); + uint32_t* heapv4i32 = reinterpret_cast(initialHeapData_.data()); + heapv4i32[0] = 42; + heapv4i32[1] = 7; + heapv4i32[2] = UINT32_MAX; + heapv4i32[3] = 7; + heapv4i32[4] = 1; + heapv4i32[5] = (1u << 31) - 1; + heapv4i32[6] = 0; + heapv4i32[7] = 7; RUN_AARCH64(R"( # Get heap address diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 951a6f7627..4b0fe0e8ff 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1322,6 +1322,192 @@ TEST_P(InstSve, cmphi_vec) { EXPECT_EQ(getNZCV(), 0b0110); } +TEST_P(InstSve, cmphs_vec) { + // 8-bit + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z0.b, z1.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.b, xzr, x0 + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z0.b, z1.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z1.b, z0.b + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #6 + dup z1.b, #6 + + cmphs p1.b, p0/z, z1.b, z0.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 16-bit + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z0.h, z1.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.h, xzr, x0 + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z0.h, z1.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z1.h, z0.h + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #6 + dup z1.h, #6 + + cmphs p1.h, p0/z, z1.h, z0.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z0.s, z1.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.s, xzr, x0 + dup z0.s, #-5 + dup 
z1.s, #4 + + cmphs p1.s, p0/z, z0.s, z1.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z1.s, z0.s + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #6 + dup z1.s, #6 + + cmphs p1.s, p0/z, z1.s, z0.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 64-bit + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z0.d, z1.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.d, xzr, x0 + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z0.d, z1.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z1.d, z0.d + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #6 + dup z1.d, #6 + + cmphs p1.d, p0/z, z1.d, z0.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); +} + TEST_P(InstSve, cnt) { // pattern = all RUN_AARCH64(R"( From 466fc3d772256618cf88c9cfd31cc8c4496d2bd4 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 9 Sep 2024 21:00:47 +0100 Subject: [PATCH 22/38] Added tests for FDIV and LASTB. Fixed LASTB logic. --- .../simeng/arch/aarch64/helpers/sve.hh | 6 +- test/regression/aarch64/instructions/neon.cc | 34 +++++- test/regression/aarch64/instructions/sve.cc | 104 ++++++++++++++++++ 3 files changed, 137 insertions(+), 7 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 27e9ed7753..8b9f32c89a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1121,8 +1121,9 @@ RegisterValue sveIndex( template RegisterValue sveLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - const uint64_t* p = sourceValues[0].getAsVector(); - const T* n = sourceValues[1].getAsVector(); + // sourceValues are wrong and the correct value is in the previous index. + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out; @@ -1149,6 +1150,7 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, template RegisterValue sveCLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { + // sourceValues are wrong and the correct value is in the previous index. 
const uint64_t* p = sourceValues[1].getAsVector(); const uint64_t* m = sourceValues[2].getAsVector(); const T* n = sourceValues[3].getAsVector(); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 320014d0cb..680574158a 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -1610,11 +1610,12 @@ TEST_P(InstNeon, fcvtl2) { TEST_P(InstNeon, fdiv) { initialHeapData_.resize(32); - double* heap = reinterpret_cast(initialHeapData_.data()); - heap[0] = 1.0; - heap[1] = -42.5; - heap[2] = -0.125; - heap[3] = 16.0; + // 2 Doubles + double* heapv2f64 = reinterpret_cast(initialHeapData_.data()); + heapv2f64[0] = 1.0; + heapv2f64[1] = -42.5; + heapv2f64[2] = -0.125; + heapv2f64[3] = 16.0; RUN_AARCH64(R"( # Get heap address @@ -1627,6 +1628,29 @@ TEST_P(InstNeon, fdiv) { fdiv v2.2d, v0.2d, v1.2d )"); CHECK_NEON(2, double, {-8.0, -2.65625}); + + // 4 Floats + float* heapv4f32 = reinterpret_cast(initialHeapData_.data()); + heapv4f32[0] = 1.0f; + heapv4f32[1] = -42.5f; + heapv4f32[2] = 10.0f; + heapv4f32[3] = 0.0f; + heapv4f32[4] = -0.125f; + heapv4f32[5] = 16.0f; + heapv4f32[6] = -2.0f; + heapv4f32[7] = 256.0f; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + fdiv v2.4s, v0.4s, v1.4s + )"); + CHECK_NEON(2, float, {-8.0f, -2.65625f, -5.0f, 0.0f}); } TEST_P(InstNeon, fmla) { diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 4b0fe0e8ff..a1a5429c2c 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6733,6 +6733,110 @@ TEST_P(InstSve, clastb) { CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); } +TEST_P(InstSve, lastb) { + // 64 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + ptrue p0.d + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb d4, p0, z2.d + mov z0.d, z4.d + + ptrue p0.d + lastb d5, p0, z3.d + mov z1.d, z5.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123456789ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); + + // 32 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb s4, p0, z2.s + mov z0.d, z4.d + + ptrue p0.s + lastb s4, p0, z3.s + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x01234567}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA9}, 8)); + + // 16 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb h4, p0, z2.h + mov z0.d, z4.d + + ptrue p0.h + lastb h4, p0, z3.h + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FED}, 8)); + + // 8 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL 
#48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb b4, p0, z2.b + mov z0.d, z4.d + + ptrue p0.b + lastb b4, p0, z3.b + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x01}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 0aa2584943b44fde2a28f9c94236fb28ccc8326f Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Sep 2024 15:53:49 +0100 Subject: [PATCH 23/38] Finally got smax tests --- test/regression/aarch64/instructions/sve.cc | 174 ++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index a1a5429c2c..f79402491a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6389,6 +6389,64 @@ TEST_P(InstSve, sel) { } TEST_P(InstSve, smax) { + // 64-bit + initialHeapData_.resize(VL / 4); + int64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB64 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap64, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #8 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.d, xzr, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + ld1d {z2.d}, p0/z, [x0, x2, lsl #3] + ld1d {z3.d}, p0/z, [x0, x1, lsl #3] + ld1d {z4.d}, p0/z, [x0, x1, lsl #3] + ld1d {z5.d}, p0/z, [x0, x1, lsl #3] + + smax z1.d, p0/m, z1.d, z0.d + smax z2.d, p1/m, z2.d, z0.d + + smax z3.d, z3.d, #0 + smax z4.d, z4.d, #-128 + smax z5.d, z5.d, #127 + )"); + std::vector results64 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int64_t, fillNeon(results64, VL / 8)); + std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end()); + CHECK_NEON(2, int64_t, fillNeonCombined(results64, srcB64, VL / 8)); + + CHECK_NEON(3, int64_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int64_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int64_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127}, + VL / 8)); + // 32-bit initialHeapData_.resize(VL / 4); int32_t* heap32 = reinterpret_cast(initialHeapData_.data()); @@ -6446,6 +6504,122 @@ TEST_P(InstSve, smax) { fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}, VL / 8)); + + // 16-bit + initialHeapData_.resize(VL / 4); + int16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA16 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB16 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap16, srcA16, srcB16, VL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #2 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.h, xzr, x3 + ptrue p0.h + + ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z1.h}, p0/z, [x0, x2, lsl #1] + ld1h {z2.h}, p0/z, [x0, x2, lsl #1] + ld1h {z3.h}, p0/z, [x0, x1, lsl #1] + ld1h {z4.h}, p0/z, 
[x0, x1, lsl #1] + ld1h {z5.h}, p0/z, [x0, x1, lsl #1] + + smax z1.h, p0/m, z1.h, z0.h + smax z2.h, p1/m, z2.h, z0.h + + smax z3.h, z3.h, #0 + smax z4.h, z4.h, #-128 + smax z5.h, z5.h, #127 + )"); + std::vector results16 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int16_t, fillNeon(results16, VL / 8)); + std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end()); + CHECK_NEON(2, int16_t, fillNeonCombined(results16, srcB16, VL / 8)); + + CHECK_NEON(3, int16_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int16_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int16_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127}, + VL / 8)); + + // 8-bit + initialHeapData_.resize(VL / 4); + int8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA8 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB8 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap8, srcA8, srcB8, VL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #1 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.b, xzr, x3 + ptrue p0.b + + ld1b {z0.b}, p0/z, [x0, x1] + ld1b {z1.b}, p0/z, [x0, x2] + ld1b {z2.b}, p0/z, [x0, x2] + ld1b {z3.b}, p0/z, [x0, x1] + ld1b {z4.b}, p0/z, [x0, x1] + ld1b {z5.b}, p0/z, [x0, x1] + + smax z1.b, p0/m, z1.b, z0.b + smax z2.b, p1/m, z2.b, z0.b + + smax z3.b, z3.b, #0 + smax z4.b, z4.b, #-128 + smax z5.b, z5.b, #127 + )"); + std::vector results8 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int8_t, fillNeon(results8, VL / 8)); + std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end()); + CHECK_NEON(2, int8_t, fillNeonCombined(results8, srcB8, VL / 8)); + + CHECK_NEON(3, int8_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int8_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int8_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127}, + VL / 8)); } TEST_P(InstSve, smin) { From 75f0d9f32f85f62c5064099b014f41e06e299a31 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Sep 2024 16:21:39 +0100 Subject: [PATCH 24/38] Also added smin tests --- test/regression/aarch64/instructions/sve.cc | 195 ++++++++++++++++++-- 1 file changed, 183 insertions(+), 12 deletions(-) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f79402491a..b55d6b2a4d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6623,6 +6623,63 @@ TEST_P(InstSve, smax) { } TEST_P(InstSve, smin) { + // 64-bit + initialHeapData_.resize(VL / 4); + int64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB64 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap64, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #8 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.d, xzr, x3 + 
ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + ld1d {z2.d}, p0/z, [x0, x2, lsl #3] + + smin z1.d, p0/m, z1.d, z0.d + smin z2.d, p1/m, z2.d, z0.d + + sminv d3, p1, z1.d + sminv d4, p0, z2.d + )"); + + std::vector results64 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA64 = fillNeon(results64, VL / 8); + std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end()); + std::array arrB64 = + fillNeonCombined(results64, srcB64, VL / 8); + + CHECK_NEON(1, int64_t, arrA64); + CHECK_NEON(2, int64_t, arrB64); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. + int64_t minElemA64 = arrA64[std::distance( + arrA64.begin(), + std::min_element(arrA64.begin(), arrA64.end() - (32 - VL / 128)))]; + int64_t minElemB64 = arrB64[std::distance( + arrB64.begin(), + std::min_element(arrB64.begin(), arrB64.end() - (32 - VL / 64)))]; + CHECK_NEON(3, int64_t, {minElemA64, 0, 0, 0}); + CHECK_NEON(4, int64_t, {minElemB64, 0, 0, 0}); + // 32-bit initialHeapData_.resize(VL / 4); int32_t* heap32 = reinterpret_cast(initialHeapData_.data()); @@ -6662,23 +6719,137 @@ TEST_P(InstSve, smin) { std::vector results32 = {1, 2, 3, 4, -12, -11, -10, -9, -9, -10, -11, -12, 4, 3, -15, -1}; - std::array arrA = fillNeon(results32, VL / 8); + std::array arrA32 = fillNeon(results32, VL / 8); std::rotate(srcB32.begin(), srcB32.begin() + ((VL / 64) % 16), srcB32.end()); - std::array arrB = + std::array arrB32 = fillNeonCombined(results32, srcB32, VL / 8); - CHECK_NEON(1, int32_t, arrA); - CHECK_NEON(2, int32_t, arrB); + CHECK_NEON(1, int32_t, arrA32); + CHECK_NEON(2, int32_t, arrB32); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. + int32_t minElemA32 = arrA32[std::distance( + arrA32.begin(), + std::min_element(arrA32.begin(), arrA32.end() - (64 - VL / 64)))]; + int32_t minElemB32 = arrB32[std::distance( + arrB32.begin(), + std::min_element(arrB32.begin(), arrB32.end() - (64 - VL / 32)))]; + CHECK_NEON(3, int32_t, {minElemA32, 0, 0, 0}); + CHECK_NEON(4, int32_t, {minElemB32, 0, 0, 0}); + + // 16-bit + initialHeapData_.resize(VL / 4); + int16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA16 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB16 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap16, srcA16, srcB16, VL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #2 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.h, xzr, x3 + ptrue p0.h + + ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z1.h}, p0/z, [x0, x2, lsl #1] + ld1h {z2.h}, p0/z, [x0, x2, lsl #1] + + smin z1.h, p0/m, z1.h, z0.h + smin z2.h, p1/m, z2.h, z0.h + + sminv h3, p1, z1.h + sminv h4, p0, z2.h + )"); + + std::vector results16 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA16 = fillNeon(results16, VL / 8); + std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end()); + std::array arrB16 = + fillNeonCombined(results16, srcB16, VL / 8); + + CHECK_NEON(1, int16_t, arrA16); + CHECK_NEON(2, int16_t, arrB16); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. 
+ int16_t minElemA16 = arrA16[std::distance( + arrA16.begin(), + std::min_element(arrA16.begin(), arrA16.end() - (128 - VL / 32)))]; + int16_t minElemB16 = arrB16[std::distance( + arrB16.begin(), + std::min_element(arrB16.begin(), arrB16.end() - (128 - VL / 16)))]; + CHECK_NEON(3, int16_t, {minElemA16, 0, 0, 0}); + CHECK_NEON(4, int16_t, {minElemB16, 0, 0, 0}); + + // 8-bit + initialHeapData_.resize(VL / 4); + int8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA8 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB8 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap8, srcA8, srcB8, VL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #1 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.b, xzr, x3 + ptrue p0.b + + ld1b {z0.b}, p0/z, [x0, x1] + ld1b {z1.b}, p0/z, [x0, x2] + ld1b {z2.b}, p0/z, [x0, x2] + + smin z1.b, p0/m, z1.b, z0.b + smin z2.b, p1/m, z2.b, z0.b + + sminv b3, p1, z1.b + sminv b4, p0, z2.b + )"); + + std::vector results8 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA8 = fillNeon(results8, VL / 8); + std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end()); + std::array arrB8 = + fillNeonCombined(results8, srcB8, VL / 8); + + CHECK_NEON(1, int8_t, arrA8); + CHECK_NEON(2, int8_t, arrB8); // Find miniumum element. Modify search end point to only consider the // elements within the current VL and predication. - int32_t minElemA = arrA[std::distance( - arrA.begin(), - std::min_element(arrA.begin(), arrA.end() - (64 - VL / 64)))]; - int32_t minElemB = arrB[std::distance( - arrB.begin(), - std::min_element(arrB.begin(), arrB.end() - (64 - VL / 32)))]; - CHECK_NEON(3, int32_t, {minElemA, 0, 0, 0}); - CHECK_NEON(4, int32_t, {minElemB, 0, 0, 0}); + int8_t minElemA8 = arrA8[std::distance( + arrA8.begin(), + std::min_element(arrA8.begin(), arrA8.end() - (256 - VL / 16)))]; + int8_t minElemB8 = arrB8[std::distance( + arrB8.begin(), + std::min_element(arrB8.begin(), arrB8.end() - (256 - VL / 8)))]; + CHECK_NEON(3, int8_t, {minElemA8, 0, 0, 0}); + CHECK_NEON(4, int8_t, {minElemB8, 0, 0, 0}); } TEST_P(InstSve, smulh) { From 02386a380b27b947d02ab00d533472bbc9789ae0 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Sep 2024 23:48:19 +0100 Subject: [PATCH 25/38] Added tests for umaxv and whilels --- test/regression/aarch64/instructions/sve.cc | 258 +++++++++++++++++++- 1 file changed, 249 insertions(+), 9 deletions(-) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index b55d6b2a4d..f48e121c7d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6947,16 +6947,32 @@ TEST_P(InstSve, umaxv) { heap[5] = 0xCC; heap[6] = 0xDD; heap[7] = 0xEE; + heap[8] = 0x07; + heap[9] = 0x00; + heap[10] = 0xFC; + heap[11] = 0xFD; + heap[12] = 0xBA; + heap[13] = 0xCA; + heap[14] = 0x39; + heap[15] = 0xEF; // v1 - heap[8] = 0x00; - heap[9] = 0x00; - heap[10] = 0xEE; - heap[11] = 0x11; - heap[12] = 0x22; - heap[13] = 0x33; - heap[14] = 0x44; - heap[15] = 0x55; + heap[16] = 0x00; + heap[17] = 0x00; + heap[18] = 0xEE; + heap[19] = 0x11; + heap[20] = 0x22; + heap[21] = 0x33; + heap[22] = 0x44; + heap[23] = 0x55; + heap[24] = 0x26; + heap[25] = 0xFF; + heap[26] = 0xEA; + heap[27] = 0xFA; + heap[28] = 0x14; + heap[29] = 0x43; + heap[30] = 
0x21; + heap[31] = 0xAE; RUN_AARCH64(R"( # Get heap address @@ -6965,15 +6981,45 @@ TEST_P(InstSve, umaxv) { svc #0 ldr q0, [x0] - ldr q1, [x0, #8] + ldr q1, [x0, #16] umaxv h2, v0.4h umaxv h3, v1.4h + umaxv h4, v0.8h + umaxv h5, v1.8h + + umaxv s6, v0.4s + umaxv s7, v1.4s + + umaxv b8, v0.8b + umaxv b9, v1.8b + + umaxv b10, v0.16b + umaxv b11, v1.16b + )"); CHECK_NEON(2, uint16_t, {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); CHECK_NEON(3, uint16_t, {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(4, uint16_t, + {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(5, uint16_t, + {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(8, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(9, uint8_t, + {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(10, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(11, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) } TEST_P(InstSve, clastb) { @@ -8722,6 +8768,200 @@ TEST_P(InstSve, whilelo) { EXPECT_EQ(getNZCV(), 0b0110); } +TEST_P(InstSve, whilels) { + // 8-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + + whilels p0.b, xzr, x0 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + + whilels p1.b, x2, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + mov x3, #4 + udiv x4, x0, x3 + add x5, x4, x2 + + whilels p2.b, x5, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + sub x0, x0, #1 + mov x1, #0 + + whilels p3.b, x1, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 16-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + + whilels p0.h, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x0, x0, x1 + udiv x2, x0, x1 + + whilels p1.h, x2, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x2, x0, x1 + mov x3, #8 + udiv x4, x0, x3 + mov x5, #2 + udiv x0, x0, x5 + add x6, x4, x2 + + whilels p2.h, x6, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.h, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 32-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x2, 
x0, x1 + + whilels p0.s, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + mov x2, #4 + udiv x0, x0, x2 + udiv x3, x0, x1 + + whilels p1.s, x3, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x2, x0, x1 + mov x3, #16 + udiv x4, x0, x3 + mov x5, #4 + udiv x0, x0, x5 + add x6, x4, x2 + + whilels p2.s, x6, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.s, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 64-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x2, x0, x1 + + whilels p0.d, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + mov x2, #8 + udiv x0, x0, x2 + udiv x3, x0, x1 + + whilels p1.d, x3, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 8)); + if (VL == 128) { + EXPECT_EQ(getNZCV(), 0b1000); + } else { + EXPECT_EQ(getNZCV(), 0b1010); + } + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.d, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); +} + TEST_P(InstSve, whilelt) { // 8-bit arrangement, 64-bit source operands RUN_AARCH64(R"( From 4712ea43fe0492b9bdaa2be3f41e8f91c55e1656 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 11 Sep 2024 17:35:36 +0100 Subject: [PATCH 26/38] Added (or fixed) tests for pfirst and splice --- .../simeng/arch/aarch64/helpers/sve.hh | 6 +- test/regression/aarch64/instructions/sve.cc | 62 +++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 8b9f32c89a..7853ac438b 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1538,12 +1538,12 @@ std::array svePsel( std::array svePfirst(srcValContainer& sourceValues, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / 8; - const uint64_t* p = sourceValues[0].getAsVector(); - const uint64_t* dn = sourceValues[1].getAsVector(); + // sourceValues are wrong and the correct value is in the previous index. 
+ const uint64_t* p = sourceValues[1].getAsVector(); + const uint64_t* dn = sourceValues[2].getAsVector(); // Set destination d as source n to copy all false lanes and the active lanes // beyond the first std::array out = {dn[0], dn[1], dn[2], dn[3]}; - // Get the first active lane and set same lane in destination predicate for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % (64))); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f48e121c7d..e733c9e667 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5971,6 +5971,26 @@ TEST_P(InstSve, pfalse) { CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {0}, 1)); } +TEST_P(InstSve, pfirst) { + RUN_AARCH64(R"( + ptrue p0.b + pfalse p1.b + ptrue p2.b + ptrue p3.b + pfalse p4.b + pfalse p5.b + + pfirst p2.b, p0, p2.b + pfirst p3.b, p1, p3.b + pfirst p4.b, p0, p4.b + pfirst p5.b, p1, p5.b + )"); + CHECK_PREDICATE(2, uint64_t, fillPred(VL / 8, {1}, 1)); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); + CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); + CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); +} + TEST_P(InstSve, ptrue) { RUN_AARCH64(R"( ptrue p0.s @@ -7228,6 +7248,48 @@ TEST_P(InstSve, lastb) { CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); } +TEST_P(InstSve, splice) { + // 64-bit arrangement + RUN_AARCH64(R"( + fmov z0.d, #1.5 + fmov z1.d, #-0.5 + fmov z2.d, #1.5 + + ptrue p0.d + + mov x2, #0 + mov x4, #16 + addvl x2, x2, #1 + udiv x2, x2, x4 + whilelo p1.d, xzr, x2 + + splice z0.d, p0, z0.d, z1.d + splice z2.d, p1, z2.d, z1.d + )"); + CHECK_NEON(0, double, fillNeon({1.5}, VL / 8)); + CHECK_NEON(2, double, fillNeonCombined({1.5}, {-0.5}, VL / 8)); + + // 32-bit arrangement + RUN_AARCH64(R"( + fmov z0.s, #1.5 + fmov z1.s, #-0.5 + fmov z2.s, #1.5 + + ptrue p0.s + + mov x2, #0 + mov x4, #8 + addvl x2, x2, #1 + udiv x2, x2, x4 + whilelo p1.s, xzr, x2 + + splice z0.s, p0, z0.s, z1.s + splice z2.s, p1, z2.s, z1.s + )"); + CHECK_NEON(0, float, fillNeon({1.5}, VL / 8)); + CHECK_NEON(2, float, fillNeonCombined({1.5}, {-0.5}, VL / 8)); +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From c73b2d2d74abd02646d2832d66ad1d5f5b4a0adc Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 11 Sep 2024 21:49:46 +0100 Subject: [PATCH 27/38] Added tests for ftsmul and fixed some broken logic --- .../simeng/arch/aarch64/helpers/sve.hh | 2 +- src/lib/arch/aarch64/InstructionMetadata.cc | 4 ++ test/regression/aarch64/instructions/sve.cc | 71 +++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 7853ac438b..d704a38269 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -921,7 +921,7 @@ RegisterValue sveFTrigSMul(srcValContainer& sourceValues, // register for (int i = 0; i < partition_num; i++) { out[i] = n[i] * n[i]; - T sign_bit = m[i] & bit_0_mask ? 1.0 : -1.0; + T sign_bit = m[i] & bit_0_mask ? 
-1.0 : 1.0; out[i] = std::abs(out[i]) * sign_bit; } diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..c177ef90f1 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -89,6 +89,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) } case Opcode::AArch64_SMAX_ZI_B: [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_H: diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index e733c9e667..c6f55865b4 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4934,6 +4934,77 @@ TEST_P(InstSve, index) { CHECK_NEON(7, uint64_t, fillNeonBaseAndOffset(10, 10, VL / 8)); } +TEST_P(InstSve, ftsmul) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1.0, 2.0, 4.0, 12.34}; + std::vector srcB64 = {1.0, -5.4, 0.0, 78.2}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + + ftsmul z2.d, z0.d, z1.d + )"); + CHECK_NEON(2, double, fillNeon({1.0, -4.0, 16.0, 152.2756}, VL / 8)); + + // 32-bit arrangement + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {1.0f, 2.0f, 4.0f, 12.34f, + -3.0f, -19.6f, 0.0f, 7.0f}; + std::vector fsrcB = {1.0f, -5.4f, 0.0f, 78.2f, + 2.1f, -26.42f, 12.0f, 3.5f}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + ptrue p1.s + + ld1w {z0.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftsmul z2.s, z0.s, z1.s + )"); + CHECK_NEON( + 0, float, + fillNeon({1.0f, 2.0f, 4.0f, 12.34f, -3.0f, -19.6f, 0.0f, 7.0f}, + VL / 16)); + CHECK_NEON( + 1, float, + fillNeon({1.0f, -5.4f, 0.0f, 78.2f, 2.1f, -26.42f, 12.0f, 3.5f}, + VL / 16)); + CHECK_NEON(2, float, + fillNeon( + {1.0f, -4.0f, 16.0f, 152.2756f, 9.0f, -384.16f, 0.0f, 49.0f}, + VL / 16)); +} + TEST_P(InstSve, ld1rd) { initialHeapData_.resize(16); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 6fe35eca5b0b10aac9863ae0b03cbf3dd5ed9626 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 11 Sep 2024 21:54:59 +0100 Subject: [PATCH 28/38] Added comment to ftsmul test --- test/regression/aarch64/instructions/sve.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c6f55865b4..7d89571dde 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4939,6 +4939,10 @@ TEST_P(InstSve, ftsmul) { // 64-bit arrangement double* dheap = reinterpret_cast(initialHeapData_.data()); std::vector srcA64 = {1.0, 2.0, 4.0, 12.34}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". 
+ // We use doubles anyway as we only care about the sign bit, and currently + // "fillHeapCombined" only takes a single templated type std::vector srcB64 = {1.0, -5.4, 0.0, 78.2}; fillHeapCombined(dheap, srcA64, srcB64, VL / 32); @@ -4967,6 +4971,10 @@ TEST_P(InstSve, ftsmul) { float* fheap = reinterpret_cast(initialHeapData_.data()); std::vector fsrcA = {1.0f, 2.0f, 4.0f, 12.34f, -3.0f, -19.6f, 0.0f, 7.0f}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". + // We use floats anyway as we only care about the sign bit, and currently + // "fillHeapCombined" only takes a single templated type std::vector fsrcB = {1.0f, -5.4f, 0.0f, 78.2f, 2.1f, -26.42f, 12.0f, 3.5f}; fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); From f22be5a94b1e9787f99d558f2a0fa396bc2ade75 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 17 Sep 2024 22:01:48 +0100 Subject: [PATCH 29/38] Added FTSSEL tests. Nasty bugger.... --- .../simeng/arch/aarch64/helpers/sve.hh | 8 +- src/lib/arch/aarch64/InstructionMetadata.cc | 4 + test/regression/aarch64/instructions/sve.cc | 87 +++++++++++++++++-- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index d704a38269..99ae84f3b7 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -947,13 +947,11 @@ RegisterValue sveFTrigSSel(srcValContainer& sourceValues, // Place the value 1.0 or a copy of the first source vector element in the // destination element, depending on bit 0 of the corresponding element of // the second source vector. The sign bit of the destination element is - // copied from bit 1 of the second source vector + // negated from bit 1 of the second source vector for (int i = 0; i < partition_num; i++) { - out[i] = m[i] & bit_0_mask ? 1.0 : n[i]; - T sign_bit = m[i] & bit_1_mask ? 1.0 : -1.0; - out[i] = std::abs(out[i]) * sign_bit; + out[i] = m[i] & bit_0_mask ? static_cast(1.0) : n[i]; + out[i] = m[i] & bit_1_mask ? 
-out[i] : out[i]; } - return {out, 256}; } diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index c177ef90f1..9653e3a00a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -93,6 +93,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) [[fallthrough]]; case Opcode::AArch64_FTSMUL_ZZZ_S: [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_H: diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 7d89571dde..eb826f9a5d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4963,8 +4963,10 @@ TEST_P(InstSve, ftsmul) { ld1d {z1.d}, p0/z, [x0, x2, lsl #3] ftsmul z2.d, z0.d, z1.d + ftsmul z3.d, z1.d, z0.d )"); CHECK_NEON(2, double, fillNeon({1.0, -4.0, 16.0, 152.2756}, VL / 8)); + CHECK_NEON(3, double, fillNeon({1.0, 29.16, 0.0, 6115.24}, VL / 8)); // 32-bit arrangement initialHeapData_.resize(VL / 8); @@ -4998,19 +5000,88 @@ TEST_P(InstSve, ftsmul) { ld1w {z1.s}, p0/z, [x0, x2, lsl #2] ftsmul z2.s, z0.s, z1.s + ftsmul z3.s, z1.s, z0.s )"); - CHECK_NEON( - 0, float, - fillNeon({1.0f, 2.0f, 4.0f, 12.34f, -3.0f, -19.6f, 0.0f, 7.0f}, - VL / 16)); - CHECK_NEON( - 1, float, - fillNeon({1.0f, -5.4f, 0.0f, 78.2f, 2.1f, -26.42f, 12.0f, 3.5f}, - VL / 16)); CHECK_NEON(2, float, fillNeon( {1.0f, -4.0f, 16.0f, 152.2756f, 9.0f, -384.16f, 0.0f, 49.0f}, VL / 16)); + CHECK_NEON(3, float, + fillNeon({1.0f, 29.16f, 0.0f, 6115.24f, -4.41f, -698.0164f, + 144.0f, 12.25f}, + VL / 16)); +} + +TEST_P(InstSve, ftssel) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + // We use uint64_t to model doubles here as we care about the bit patterns + // rather than values + uint64_t* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {0x1234, 0xABCD, 0x00000000F0F0FFFF, 0x9876}; + // Note that "The use of the second operand is consistent with it holding an + // integer corresponding to the desired sine-wave quadrant." + std::vector srcB64 = {0x0, 0x8000000000000000, 0x4000000000000000, + 0xC000000000000000}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + + ftssel z2.d, z0.d, z1.d + )"); + CHECK_NEON(2, uint64_t, + fillNeon({0x1234, 0x3ff0000000000000, 0x80000000F0F0FFFF, + 0xbff0000000000000}, + VL / 8)); + + // 32-bit arrangement + // We use uint32_t to model floats here as we care about the bit patterns + // rather than values + initialHeapData_.resize(VL / 8); + uint32_t* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {0x1234, 0xABCD, 0x00F0FFFF, 0x9876}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". 
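+  // Lane by lane, the second-operand patterns below map onto the expected
+  // results as follows:
+  //   0x00000000 -> first source element passed through    (0x00001234)
+  //   0x80000000 -> element replaced with +1.0f             (0x3f800000)
+  //   0x40000000 -> first source element with sign bit set  (0x80F0FFFF)
+  //   0xC0000000 -> element replaced with -1.0f             (0xBF800000)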
+ std::vector fsrcB = {0x0, 0x80000000, 0x40000000, 0xC0000000}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + ptrue p1.s + + ld1w {z0.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftssel z2.s, z0.s, z1.s + )"); + CHECK_NEON(2, uint32_t, + fillNeon({0x1234, 0x3f800000, 0x80F0FFFF, 0xBF800000}, + VL / 16)); } TEST_P(InstSve, ld1rd) { From dad04676fec277c1ca3a009eebc7ee4e1d8f273f Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 20 Sep 2024 14:43:10 +0100 Subject: [PATCH 30/38] Finally got ftmad sorted. Had issues with 32 bit for some reason --- .../simeng/arch/aarch64/helpers/sve.hh | 43 +++++----- test/regression/aarch64/instructions/sve.cc | 78 +++++++++++++++++++ 2 files changed, 99 insertions(+), 22 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 99ae84f3b7..10ee4e5446 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -963,9 +963,9 @@ RegisterValue sveFTrigMad( srcValContainer& sourceValues, const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { - const T* n = sourceValues[0].getAsVector(); - const T* m = sourceValues[1].getAsVector(); - const uint8_t imm = static_cast(metadata.operands[1].imm); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + const uint8_t imm = static_cast(metadata.operands[3].imm); const std::array sin64 = {1.0, -0.1666666666666661, @@ -985,31 +985,30 @@ RegisterValue sveFTrigMad( 0.2087558253975872e-08, -0.1135338700720054e-10}; - const std::array sin32 = {1.0, - -1.666666716337e-01, - 8.333330973983e-03, - -1.983967522392e-04, - 2.721174723774e-06, - 0.0, - 0.0, - 0.0}; - - const std::array cos32 = {1.0, - -5.000000000000e-01, - 4.166664928198e-02, - -1.388759003021e-03, - 2.446388680255e-05, - 0.0, - 0.0, - 0.0}; + const std::array sin32 = {1.0f, + -1.666666716337e-01f, + 8.333330973983e-03f, + -1.983967522392e-04f, + 2.721174723774e-06f, + 0.0f, + 0.0f, + 0.0f}; + + const std::array cos32 = {1.0f, + -5.000000000000e-01f, + 4.166664928198e-02f, + -1.388759003021e-03f, + 2.446388680255e-05f, + 0.0f, + 0.0f, + 0.0f}; const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; - // std::array lut; for (int i = 0; i < partition_num; i++) { T coeff; - const bool sign_bit = m[i] < 0 ? 1 : 0; + const bool sign_bit = std::signbit(m[i]); // If float then use those LUTs if (sizeof(T) == 4) { coeff = sign_bit ? 
cos32[imm] : sin32[imm]; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index eb826f9a5d..eda1d97602 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5084,6 +5084,84 @@ TEST_P(InstSve, ftssel) { VL / 16)); } +TEST_P(InstSve, ftmad) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {0.0, 0.5, -0.5, 0.75}; + std::vector srcB64 = {0.0, 0.5, -0.4, -0.2}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + mov z2.d, z0.d + mov z3.d, z0.d + mov z4.d, z0.d + + ftmad z2.d, z2.d, z1.d, #0 + ftmad z3.d, z3.d, z1.d, #2 + ftmad z4.d, z4.d, z1.d, #7 + )"); + CHECK_NEON(2, double, fillNeon({1.0, 1.25, 0.8, 1.15}, VL / 8)); + CHECK_NEON(3, double, + fillNeon({0.008333333333320002, 0.258333333333320002, + -0.15833333333333355, 0.19166666666666645}, + VL / 8)); + CHECK_NEON( + 4, double, + fillNeon({0.0, 0.25, -0.20000000001135337, 0.1499999999886466}, + VL / 8)); + + // 32-bit arrangement + initialHeapData_.resize(VL / 4); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {0.0f, 0.5f, -0.5f, 0.75f}; + std::vector fsrcB = {0.0f, 0.5f, -0.4f, -0.2f}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #4 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + + ld1w {z2.s}, p0/z, [x0] + ld1w {z3.s}, p0/z, [x0] + ld1w {z4.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftmad z2.s, z2.s, z1.s, #0 + ftmad z3.s, z3.s, z1.s, #2 + ftmad z4.s, z4.s, z1.s, #7 + )"); + CHECK_NEON(2, float, fillNeon({1.0f, 1.25f, 0.8f, 1.15f}, VL / 8)); + CHECK_NEON(3, float, + fillNeon( + {0.00833333f, 0.25833333f, -0.1583334f, 0.1916666f}, VL / 8)); + CHECK_NEON(4, float, fillNeon({0.0f, 0.25f, -0.2f, 0.15f}, VL / 8)); +} + TEST_P(InstSve, ld1rd) { initialHeapData_.resize(16); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 5a611d31f922c04dbebbfb14fc4facb96fe5e2dc Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 20 Sep 2024 17:51:44 +0100 Subject: [PATCH 31/38] Added LDAXRB and STLXR insts. 
STLXR took some fix in decode to flag as a store --- src/lib/arch/aarch64/Instruction_address.cc | 8 +- src/lib/arch/aarch64/Instruction_decode.cc | 5 +- src/lib/arch/aarch64/Instruction_execute.cc | 8 +- test/regression/aarch64/instructions/load.cc | 62 +++++++++++++ test/regression/aarch64/instructions/store.cc | 89 +++++++++++++++++++ 5 files changed, 165 insertions(+), 7 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 3f27b5acc3..06eb7e2004 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1365,11 +1365,15 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 1}}); break; } - case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] + case Opcode::AArch64_STLXRH: { // stlxrb ws, ht, [xn] + setMemoryAddresses({{sourceValues_[1].get(), 2}}); + break; + } + case Opcode::AArch64_STLXRW: { // stlxrb ws, wt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } - case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] + case Opcode::AArch64_STLXRX: { // stlxr ws, xwt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..de68245ff6 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -499,8 +499,9 @@ void Instruction::decode() { // Check first operand access to determine if it's a load or store if (metadata_.operands[0].access & CS_AC_WRITE) { - if (metadata_.id == AARCH64_INS_STXR || - metadata_.id == AARCH64_INS_STLXR) { + if (metadata_.id == ARM64_INS_STXR || metadata_.id == ARM64_INS_STLXR || + metadata_.id == ARM64_INS_STLXRB || + metadata_.id == ARM64_INS_STLXRH) { // Exceptions to this is load condition are exclusive store with a // success flag as first operand if (microOpcode_ != MicroOpcode::STR_DATA) { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index fecf3a36ae..8e8706bcc5 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5176,17 +5176,19 @@ void Instruction::execute() { } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] - // STORE + // STORE + std::cout << "sv0: " << sourceValues_[0] << "\n"; memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STLXRB: // stlxrb ws, wt, [xn] + case Opcode::AArch64_STLXRH: // stlxrh ws, wt, [xn] case Opcode::AArch64_STLXRW: // stlxr ws, wt, [xn] case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] // STORE memoryData_[0] = sourceValues_[0]; - // TODO: Implement atomic memory access - results_[0] = static_cast(0); + // TODO: Implement atomic memory access + results_[0] = {0, 8}; break; } case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 05ffdd90a0..ed165943af 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -695,6 +695,68 @@ TEST_P(InstLoad, ldarb) { EXPECT_EQ(getGeneralRegister(7), 64); } +TEST_P(InstLoad, ldaxrb) { + initialHeapData_.resize(8); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + ldaxrb w1, [x0] + 
add x0, x0, #1 + ldaxrb w2, [x0] + add x0, x0, #1 + ldaxrb w3, [x0] + add x0, x0, #1 + ldaxrb w4, [x0] + add x0, x0, #1 + ldaxrb w5, [x0] + add x0, x0, #1 + ldaxrb w6, [x0] + add x0, x0, #1 + ldaxrb w7, [x0] + add x0, x0, #1 + ldaxrb w8, [x0] + )"); + EXPECT_EQ(getGeneralRegister(1), 0xEF); + EXPECT_EQ(getGeneralRegister(2), 0xBE); + EXPECT_EQ(getGeneralRegister(3), 0xAD); + EXPECT_EQ(getGeneralRegister(4), 0xDE); + EXPECT_EQ(getGeneralRegister(5), 0x78); + EXPECT_EQ(getGeneralRegister(6), 0x56); + EXPECT_EQ(getGeneralRegister(7), 0x34); + EXPECT_EQ(getGeneralRegister(8), 0x12); + + RUN_AARCH64(R"( + sub sp, sp, #1024 + mov w0, #16 + mov w1, #32 + mov w2, #48 + mov w3, #64 + str w0, [sp], #32 + str w1, [sp], #32 + str w2, [sp], #32 + str w3, [sp], #32 + sub sp, sp, #128 + ldaxrb w4, [sp] + add sp, sp, #32 + ldaxrb w5, [sp] + add sp, sp, #32 + ldaxrb w6, [sp] + add sp, sp, #32 + ldaxrb w7, [sp] + )"); + + EXPECT_EQ(getGeneralRegister(4), 16); + EXPECT_EQ(getGeneralRegister(5), 32); + EXPECT_EQ(getGeneralRegister(6), 48); + EXPECT_EQ(getGeneralRegister(7), 64); +} + TEST_P(InstLoad, ldrb) { initialHeapData_.resize(8); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b494..c2298693b8 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -60,6 +60,95 @@ TEST_P(InstStore, stlr) { 0xBABA); } +TEST_P(InstStore, stlxr) { + // stlxrb + RUN_AARCH64(R"( + mov w0, 0xAB + mov w1, 0x12 + mov w2, 0xCD + mov w3, 0x34 + sub sp, sp, #4 + stlxrb w4, w0, [sp] + add sp, sp, #1 + stlxrb w5, w1, [sp] + add sp, sp, #1 + stlxrb w6, w2, [sp] + add sp, sp, #1 + stlxrb w7, w3, [sp] + add sp, sp, #1 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 3), + 0x12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0xCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1), + 0x34); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + // stlxrh + RUN_AARCH64(R"( + mov w0, 0xABCD + mov w1, 0x1234 + mov w2, 0xCDEF + mov w3, 0x3456 + sub sp, sp, #8 + stlxrh w4, w0, [sp] + add sp, sp, #2 + stlxrh w5, w1, [sp] + add sp, sp, #2 + stlxrh w6, w2, [sp] + add sp, sp, #2 + stlxrh w7, w3, [sp] + add sp, sp, #2 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xABCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 6), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xCDEF); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0x3456); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + // stlxr + RUN_AARCH64(R"( + mov w0, 0xABCD + mov w1, 0x1234 + mov w2, 0xCDEF + mov w3, 0x3456 + sub sp, sp, #24 + stlxr w4, x0, [sp] + add sp, sp, #8 + stlxr w5, x1, [sp] + add sp, sp, #8 + stlxr w6, w2, [sp] + add sp, sp, #4 + stlxr w7, w3, [sp] + add sp, sp, #4 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 0xABCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xCDEF); + 
EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0x3456); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); +} + TEST_P(InstStore, strb) { RUN_AARCH64(R"( mov w0, 0xAB From a58409bd39ca883f9f7f91acc0efd30277287a4c Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 20 Sep 2024 18:18:22 +0100 Subject: [PATCH 32/38] Added test for ORN. Finished all base tests --- test/regression/aarch64/instructions/neon.cc | 21 ++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 680574158a..1a51df01e7 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -1090,6 +1090,27 @@ TEST_P(InstNeon, eor) { CHECK_NEON(3, uint8_t, {1, 3, 1, 7, 1, 3, 1, 15, 0, 0, 0, 0, 0, 0, 0, 0}); } +TEST_P(InstNeon, orn) { + initialHeapData_.resize(16); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 8; i++) { + heap[i] = i; + heap[i + 8] = i + 1; + } + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + + orn v2.8b, v0.8b, v1.8b + )"); + CHECK_NEON(2, uint8_t, {254, 253, 254, 251, 254, 253, 254, 247}); +} + TEST_P(InstNeon, ext) { RUN_AARCH64(R"( movi v0.16b, #0xAB From 0ffcd51bb0e268adc0b084604d66cb24afd714f7 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Dec 2024 13:33:05 +0000 Subject: [PATCH 33/38] Added group tests to all added insts --- src/lib/arch/aarch64/Instruction_execute.cc | 2 -- test/regression/aarch64/instructions/load.cc | 5 +++ test/regression/aarch64/instructions/neon.cc | 8 +++++ test/regression/aarch64/instructions/store.cc | 7 +++++ test/regression/aarch64/instructions/sve.cc | 31 +++++++++++++++++++ 5 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8e8706bcc5..23a51c190c 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5176,8 +5176,6 @@ void Instruction::execute() { } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] - // STORE - std::cout << "sv0: " << sourceValues_[0] << "\n"; memoryData_[0] = sourceValues_[0]; break; } diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index ed165943af..83737c14ce 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -3,6 +3,7 @@ namespace { using InstLoad = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstLoad, ld1r) { // 8-bit @@ -755,6 +756,8 @@ TEST_P(InstLoad, ldaxrb) { EXPECT_EQ(getGeneralRegister(5), 32); EXPECT_EQ(getGeneralRegister(6), 48); EXPECT_EQ(getGeneralRegister(7), 64); + + EXPECT_GROUP(R"(ldaxrb w7, [sp])", LOAD_INT); } TEST_P(InstLoad, ldrb) { @@ -1353,6 +1356,8 @@ TEST_P(InstLoad, ldrsw) { EXPECT_EQ(getGeneralRegister(4), -5); EXPECT_EQ(getGeneralRegister(5), -5); + EXPECT_GROUP(R"(ldrsw x4, [x0, x6, lsl #2])", LOAD_INT); + // ldursw RUN_AARCH64(R"( # Get heap address diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 1a51df01e7..91dee06ebb 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ 
b/test/regression/aarch64/instructions/neon.cc @@ -374,6 +374,8 @@ TEST_P(InstNeon, uaddlv) { uaddlv h1, v0.8b )"); CHECK_NEON(1, uint16_t, {36}); + + EXPECT_GROUP(R"(uaddlv h1, v0.8b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstNeon, and) { @@ -770,6 +772,8 @@ TEST_P(InstNeon, cmeq) { cmeq v2.4s, v0.4s, v1.4s )"); CHECK_NEON(2, uint32_t, {0, 0xFFFFFFFFu, 0xFFFFFFFFu, 0}); + + EXPECT_GROUP(R"(cmeq v2.4s, v0.4s, v1.4s)", VECTOR_SIMPLE_CMP); } TEST_P(InstNeon, cmhs) { @@ -883,6 +887,8 @@ TEST_P(InstNeon, cmhi) { )"); CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0}); CHECK_NEON(3, uint32_t, {0x0, 0xFFFFFFFF, 0x0, 0x0}); + + EXPECT_GROUP(R"(cmhi v3.4s, v1.4s, v0.4s)", VECTOR_SIMPLE_CMP); } TEST_P(InstNeon, cnt) { @@ -1109,6 +1115,8 @@ TEST_P(InstNeon, orn) { orn v2.8b, v0.8b, v1.8b )"); CHECK_NEON(2, uint8_t, {254, 253, 254, 251, 254, 253, 254, 247}); + + EXPECT_GROUP(R"(orn v2.8b, v0.8b, v1.8b)", VECTOR_SIMPLE_LOGICAL_NOSHIFT); } TEST_P(InstNeon, ext) { diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index c2298693b8..2b43e510e4 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -3,6 +3,7 @@ namespace { using InstStore = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstStore, stlr) { // stlrb @@ -90,6 +91,8 @@ TEST_P(InstStore, stlxr) { EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_GROUP(R"(stlxrb w7, w3, [sp])", STORE_ADDRESS_INT); + // stlxrh RUN_AARCH64(R"( mov w0, 0xABCD @@ -119,6 +122,8 @@ TEST_P(InstStore, stlxr) { EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_GROUP(R"(stlxrh w7, w3, [sp])", STORE_ADDRESS_INT); + // stlxr RUN_AARCH64(R"( mov w0, 0xABCD @@ -147,6 +152,8 @@ TEST_P(InstStore, stlxr) { EXPECT_EQ(getGeneralRegister(5), 0); EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 0); + + EXPECT_GROUP(R"(stlxr w7, w3, [sp])", STORE_ADDRESS_INT); } TEST_P(InstStore, strb) { diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index eda1d97602..ab027b408a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1506,6 +1506,8 @@ TEST_P(InstSve, cmphs_vec) { )"); CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(cmphs p1.d, p0/z, z1.d, z0.d)", PREDICATE); } TEST_P(InstSve, cnt) { @@ -1868,6 +1870,8 @@ TEST_P(InstSve, cpy) { CHECK_NEON(3, double, fillNeon({static_cast(-16)}, VL / 16)); CHECK_NEON(4, double, fillNeon({12}, VL / 8)); CHECK_NEON(5, double, fillNeon({static_cast(-8)}, VL / 16)); + + EXPECT_GROUP(R"(cpy z3.d, p1/m, d9)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, fcpy) { @@ -3775,6 +3779,8 @@ TEST_P(InstSve, fdiv) { CHECK_NEON(1, double, fillNeon(dresults, VL / 8)); std::rotate(dsrcB.begin(), dsrcB.begin() + ((VL / 128) % 8), dsrcB.end()); CHECK_NEON(2, double, fillNeonCombined(dresults, dsrcB, VL / 8)); + + EXPECT_GROUP(R"(fdiv z2.d, p0/m, z2.d, z0.d)", SVE_DIV_OR_SQRT); } TEST_P(InstSve, fnmls) { @@ -5010,6 +5016,8 @@ TEST_P(InstSve, ftsmul) { fillNeon({1.0f, 29.16f, 0.0f, 6115.24f, -4.41f, -698.0164f, 144.0f, 12.25f}, VL / 16)); + + EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_MUL); } TEST_P(InstSve, ftssel) { @@ -5082,6 +5090,8 @@ TEST_P(InstSve, ftssel) { CHECK_NEON(2, uint32_t, fillNeon({0x1234, 0x3f800000, 0x80F0FFFF, 0xBF800000}, VL / 16)); + 
+ EXPECT_GROUP(R"(ftssel z2.s, z0.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, ftmad) { @@ -5160,6 +5170,8 @@ TEST_P(InstSve, ftmad) { fillNeon( {0.00833333f, 0.25833333f, -0.1583334f, 0.1916666f}, VL / 8)); CHECK_NEON(4, float, fillNeon({0.0f, 0.25f, -0.2f, 0.15f}, VL / 8)); + + EXPECT_GROUP(R"(ftmad z4.s, z4.s, z1.s, #7)", SVE_MUL); } TEST_P(InstSve, ld1rd) { @@ -6217,6 +6229,8 @@ TEST_P(InstSve, pfirst) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); + + EXPECT_GROUP(R"(pfirst p5.b, p1, p5.b)", PREDICATE); } TEST_P(InstSve, ptrue) { @@ -6330,6 +6344,8 @@ TEST_P(InstSve, pnext) { CHECK_PREDICATE(1, uint64_t, fillPredFromSource({0x1, 0, 0, 0}, 32)); EXPECT_EQ(getNZCV(), 0b1010); + + EXPECT_GROUP(R"(pnext p1.d, p3, p1.d)", PREDICATE); } TEST_P(InstSve, punpk) { @@ -6868,6 +6884,8 @@ TEST_P(InstSve, smax) { fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}, VL / 8)); + + EXPECT_GROUP(R"(smax z5.b, z5.b, #127)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, smin) { @@ -7098,6 +7116,9 @@ TEST_P(InstSve, smin) { std::min_element(arrB8.begin(), arrB8.end() - (256 - VL / 8)))]; CHECK_NEON(3, int8_t, {minElemA8, 0, 0, 0}); CHECK_NEON(4, int8_t, {minElemB8, 0, 0, 0}); + + EXPECT_GROUP(R"(smin z2.b, p1/m, z2.b, z0.b)", SVE_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(sminv b4, p0, z2.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, smulh) { @@ -7268,6 +7289,8 @@ TEST_P(InstSve, umaxv) { CHECK_NEON(11, uint8_t, {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}) + + EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, clastb) { @@ -7370,6 +7393,8 @@ TEST_P(InstSve, clastb) { )"); CHECK_NEON(0, uint64_t, fillNeon({0xEF}, 8)); CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); + + EXPECT_GROUP(R"(clastb b2, p0, b2, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, lastb) { @@ -7474,6 +7499,8 @@ TEST_P(InstSve, lastb) { )"); CHECK_NEON(0, uint64_t, fillNeon({0x01}, 8)); CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); + + EXPECT_GROUP(R"(lastb b4, p0, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, splice) { @@ -7516,6 +7543,8 @@ TEST_P(InstSve, splice) { )"); CHECK_NEON(0, float, fillNeon({1.5}, VL / 8)); CHECK_NEON(2, float, fillNeonCombined({1.5}, {-0.5}, VL / 8)); + + EXPECT_GROUP(R"(splice z2.s, p1, z2.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, st1b) { @@ -9250,6 +9279,8 @@ TEST_P(InstSve, whilels) { )"); CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 8)); EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(whilels p3.d, xzr, x0)", PREDICATE); } TEST_P(InstSve, whilelt) { From 4361eabc5c1090070b3c93de3bc9ab8973aa7c79 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 16 Dec 2024 17:52:10 +0000 Subject: [PATCH 34/38] Cleaned up infinite ROB check and OpenMP bug --- src/include/simeng/pipeline/ReorderBuffer.hh | 6 +---- src/lib/arch/aarch64/ExceptionHandler.cc | 3 +-- src/lib/pipeline/ReorderBuffer.cc | 24 +++++++++----------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index 06a9aefadd..c7ee01fcc6 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,13 +125,9 @@ class ReorderBuffer { */ uint64_t pc_; - /** The address of the last instruction at the head of the ROB to 
check if - * it's stuck */ - uint64_t last_inst_addr = 0; - /** A counter for how many cycles the same instruction has been at the head of * the ROB */ - uint64_t inst_repeat_counter = 0; + uint64_t robHeadRepeatCounter_ = 0; /** The sequence ID of the youngest instruction that should remain after the * current flush. */ diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index 33701b049b..639f8e0655 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -431,14 +431,13 @@ bool ExceptionHandler::init() { } uint64_t retval = static_cast(bitmask); stateChange = {ChangeType::REPLACEMENT, {R0}, {sizeof(retval)}}; - stateChange.memoryAddresses.push_back({mask, 8}); + stateChange.memoryAddresses.push_back({mask, sizeof(bitmask)}); stateChange.memoryAddressValues.push_back(bitmask); } else { stateChange = {ChangeType::REPLACEMENT, {R0}, {-1ll}}; } break; } - case 131: { // tgkill // TODO: Functionality temporarily omitted since simeng only has a // single thread at the moment diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index 1ff4a6b6c5..33326944a3 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -81,23 +81,21 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { unsigned int n; for (n = 0; n < maxCommits; n++) { auto& uop = buffer_[0]; - if (uop->getInstructionAddress() == last_inst_addr) { - inst_repeat_counter++; - } else { - inst_repeat_counter = 0; - } - if (inst_repeat_counter > 10000000) { - std::cout - << "Infinite loop detected in rob commit at instruction address " - << std::hex << uop->getInstructionAddress() << std::dec << " (" - << uop->getMicroOpIndex() << "). Killing.\n"; - exit(1); - } - last_inst_addr = uop->getInstructionAddress(); if (!uop->canCommit()) { + // If an instruction has been stuck at the head of the rob for + // sufficiently long, assume an error in SimEng has occured. + robHeadRepeatCounter_++; + if (robHeadRepeatCounter_ > 10000000) { + std::cerr << "[SimEng:ReorderBuffer] Infinite loop detected in rob " + "commit at instruction address " + << std::hex << uop->getInstructionAddress() << std::dec + << " (" << uop->getMicroOpIndex() << ")." << std::endl; + exit(1); + } break; } + robHeadRepeatCounter_ = 0; if (uop->isLastMicroOp()) instructionsCommitted_++; From 6345f0824d23dbe31f99b505390d7a99a0f81e19 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 18 Dec 2024 11:21:25 +0000 Subject: [PATCH 35/38] Responded to PR comments. Cleaned up a lot of helper functions and fixed a few metadata issues --- .../simeng/arch/aarch64/helpers/neon.hh | 29 ++++++ .../simeng/arch/aarch64/helpers/sve.hh | 88 ++++++------------ src/lib/arch/aarch64/Instruction_execute.cc | 17 ++-- test/regression/aarch64/instructions/load.cc | 27 +----- test/regression/aarch64/instructions/neon.cc | 91 ++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 92 +------------------ 6 files changed, 160 insertions(+), 184 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..98c1648d6b 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -558,6 +558,35 @@ RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for NEON instructions with the format `uaddlv zd, zn.T`. 
+ * T represents the type of the destination register (e.g. for h0, T = + * uint32_t). + * U represents the type of the sourceValues[0] (e.g. for v0.8b, U = + * uint8_t) + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecAddlv(srcValContainer& sourceValues) { + const U* n = sourceValues[0].getAsVector(); + T out = 0; + for (int i = 0; i < I; i++) { + out += n[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecUMaxV(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out = n[0]; + for (int i = 1; i < I; i++) { + out = std::max(n[i], out); + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `umaxp vd, vn, vm`. * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). * I represents the number of elements in the output array to be updated (e.g. diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 10ee4e5446..8b23bb0ea9 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -114,33 +114,6 @@ RegisterValue sveAddvPredicated(srcValContainer& sourceValues, return {out, 256}; } -/** Helper function for NEON instructions with the format `uaddlv Vd, Vn.T`. - * T represents the type of the destination register (e.g. for h0, T = - * uint32_t). U represents the type of the sourceValues[0] (e.g. for v0.8b, U = - * uint8_t) Returns correctly formatted RegisterValue. */ -template -RegisterValue sveAddlv(srcValContainer& sourceValues) { - const U* n = sourceValues[0].getAsVector(); - T out = 0; - for (int i = 0; i < I; i++) { - out += n[i]; - } - return {out, 256}; -} - -/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. - * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). - * Returns correctly formatted RegisterValue. */ -template -RegisterValue sveUMaxV(srcValContainer& sourceValues) { - const T* n = sourceValues[0].getAsVector(); - T out = n[0]; - for (int i = 1; i < I; i++) { - out = std::max(n[i], out); - } - return {out, 256}; -} - /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, * lsl #<1,2,3>}]`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). @@ -284,7 +257,7 @@ RegisterValue sveCpy_imm( return {out, 256}; } -/** Helper function for SVE instructions with the format `cpy zd, pg/m, vn +/** Helper function for SVE instructions with the format `cpy zd, pg/m, rn * T represents the type of sourceValues (e.g. for zd.d, T = int64_t). * Returns correctly formatted RegisterValue. 
*/ template @@ -294,7 +267,7 @@ RegisterValue sveCpy_Scalar( const uint16_t VL_bits) { const T* zd = sourceValues[0].getAsVector(); const uint64_t* p = sourceValues[1].getAsVector(); - const T vn = sourceValues[2].get(); + const T rn = sourceValues[2].get(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; @@ -302,7 +275,7 @@ RegisterValue sveCpy_Scalar( for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = vn; + out[i] = rn; } else { out[i] = zd[i]; } @@ -956,7 +929,8 @@ RegisterValue sveFTrigSSel(srcValContainer& sourceValues, } /** Helper function for SVE instructions with the format `ftmad zd, zn, zm, - * #imm`. T represents the type of sourceValues (e.g. for zn.d, T = double). + * #imm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). * Returns correctly formatted RegisterValue. **/ template RegisterValue sveFTrigMad( @@ -1112,15 +1086,14 @@ RegisterValue sveIndex( return {out, 256}; } -/** Helper function for SVE instructions with the format `lastb vd, pg, zn`. +/** Helper function for SVE instructions with the format `lastb rd, pg, zn`. * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template RegisterValue sveLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - // sourceValues are wrong and the correct value is in the previous index. - const uint64_t* p = sourceValues[1].getAsVector(); - const T* n = sourceValues[2].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out; @@ -1141,15 +1114,15 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, return {out, 256}; } -/** Helper function for SVE instructions with the format `clastb vd, pg, vd, - * zn`. T represents the vector register type (e.g. zd.d would be uint64_t). +/** Helper function for SVE instructions with the format `clastb rd, pg, rd, + * zn`. + * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template RegisterValue sveCLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - // sourceValues are wrong and the correct value is in the previous index. const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t* m = sourceValues[2].getAsVector(); + const uint64_t m = sourceValues[2].get(); const T* n = sourceValues[3].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); @@ -1166,9 +1139,9 @@ RegisterValue sveCLastBScalar(srcValContainer& sourceValues, } if (lastElem < 0) { - out = static_cast(static_cast(m[0])); + out = m; } else { - out = static_cast(static_cast(n[lastElem])); + out = n[lastElem]; } return {out, 256}; } @@ -1505,7 +1478,8 @@ RegisterValue sveOrr_3vecs(srcValContainer& sourceValues, /** Helper function for SVE2 instructions with the format `psel pd, pn, * pm.t[wa, #imm]`. * T represents the type of sourceValues (e.g. for pm.d, T = - * uint64_t). Returns an array of 4 uint64_t elements. */ + * uint64_t). + * Returns an array of 4 uint64_t elements. */ template std::array svePsel( srcValContainer& sourceValues, @@ -1530,12 +1504,13 @@ std::array svePsel( return out; } -/** Helper function for SVE instructions with the format `pfirst pdn, pg, pdn`. - * Returns an array of 4 uint64_t elements. 
*/ -std::array svePfirst(srcValContainer& sourceValues, - const uint16_t VL_bits) { +/** Helper function for SVE instructions with the format `pfirst pdn.b, pg, + * pdn.b`. + * Returns an array of 4 uint64_t elements, and updates the NZCV flags. + */ +std::tuple, uint8_t> svePfirst( + srcValContainer& sourceValues, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / 8; - // sourceValues are wrong and the correct value is in the previous index. const uint64_t* p = sourceValues[1].getAsVector(); const uint64_t* dn = sourceValues[2].getAsVector(); // Set destination d as source n to copy all false lanes and the active lanes @@ -1549,10 +1524,11 @@ std::array svePfirst(srcValContainer& sourceValues, break; } } - return out; + return {out, getNZCVfromPred(out, VL_bits, 1)}; } /** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. + * T represents the type of sourceValues (e.g. for pdn.d, T = uint64_t). * Returns an array of 4 uint64_t elements, and updates the NZCV flags. */ template std::tuple, uint8_t> svePnext( @@ -1565,21 +1541,13 @@ std::tuple, uint8_t> svePnext( // Set destination elements to 0 std::array out = {0, 0, 0, 0}; - // Get pattern - const uint16_t count = - sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); - - // Exit early if count == 0 - if (count == 0) return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; // Get last active element of dn.pattern int lastElem = -1; for (int i = partition_num - 1; i >= 0; i--) { - if (i < count) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (dn[i / (64 / sizeof(T))] & shifted_active) { - lastElem = i; - break; - } + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (dn[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; } } // Get next active element of p, starting from last of dn.pattern diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 23a51c190c..2c87ee9ed3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -206,7 +206,7 @@ void Instruction::execute() { break; } case Opcode::AArch64_UADDLVv8i8v: { // uaddlv hd, vn.8b - results_[0] = sveAddlv(sourceValues_); + results_[0] = vecAddlv(sourceValues_); break; } case Opcode::AArch64_ADDWri: { // add wd, wn, #imm{, shift} @@ -4154,7 +4154,9 @@ void Instruction::execute() { break; } case Opcode::AArch64_PFIRST_B: { // pfirst pdn.b, pg, pdn.b - results_[0] = svePfirst(sourceValues_, VL_bits); + auto [result, nzcv] = svePfirst(sourceValues_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b @@ -5176,6 +5178,7 @@ void Instruction::execute() { } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] + // STORE memoryData_[0] = sourceValues_[0]; break; } @@ -5805,23 +5808,23 @@ void Instruction::execute() { break; } case Opcode::AArch64_UMAXVv16i8v: { // umaxv bd, vn.16b - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv4i16v: { // umaxv hd, vn.4h - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv4i32v: { // umaxv sd, vn.4s - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv8i16v: { // umaxv hd, vn.8h - results_[0] = sveUMaxV(sourceValues_); + results_[0] 
= vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv8i8v: { // umaxv bd, vn.8b - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 83737c14ce..bf5a3cad47 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -732,32 +732,7 @@ TEST_P(InstLoad, ldaxrb) { EXPECT_EQ(getGeneralRegister(7), 0x34); EXPECT_EQ(getGeneralRegister(8), 0x12); - RUN_AARCH64(R"( - sub sp, sp, #1024 - mov w0, #16 - mov w1, #32 - mov w2, #48 - mov w3, #64 - str w0, [sp], #32 - str w1, [sp], #32 - str w2, [sp], #32 - str w3, [sp], #32 - sub sp, sp, #128 - ldaxrb w4, [sp] - add sp, sp, #32 - ldaxrb w5, [sp] - add sp, sp, #32 - ldaxrb w6, [sp] - add sp, sp, #32 - ldaxrb w7, [sp] - )"); - - EXPECT_EQ(getGeneralRegister(4), 16); - EXPECT_EQ(getGeneralRegister(5), 32); - EXPECT_EQ(getGeneralRegister(6), 48); - EXPECT_EQ(getGeneralRegister(7), 64); - - EXPECT_GROUP(R"(ldaxrb w7, [sp])", LOAD_INT); + EXPECT_GROUP(R"(ldaxrb w8, [x0])", LOAD_INT); } TEST_P(InstLoad, ldrb) { diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 91dee06ebb..c66f6f3c6f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -2878,6 +2878,97 @@ TEST_P(InstNeon, umaxp) { 0xCC, 0xBB, 0xAA, 0x99, 0x88}); } +TEST_P(InstNeon, umaxv) { + // umaxv vd, vn.t + initialHeapData_.resize(32); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + + // v0 + heap[0] = 0x01; + heap[1] = 0x00; + heap[2] = 0xFF; + heap[3] = 0xAA; + heap[4] = 0xBB; + heap[5] = 0xCC; + heap[6] = 0xDD; + heap[7] = 0xEE; + heap[8] = 0x07; + heap[9] = 0x00; + heap[10] = 0xFC; + heap[11] = 0xFD; + heap[12] = 0xBA; + heap[13] = 0xCA; + heap[14] = 0x39; + heap[15] = 0xEF; + + // v1 + heap[16] = 0x00; + heap[17] = 0x00; + heap[18] = 0xEE; + heap[19] = 0x11; + heap[20] = 0x22; + heap[21] = 0x33; + heap[22] = 0x44; + heap[23] = 0x55; + heap[24] = 0x26; + heap[25] = 0xFF; + heap[26] = 0xEA; + heap[27] = 0xFA; + heap[28] = 0x14; + heap[29] = 0x43; + heap[30] = 0x21; + heap[31] = 0xAE; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + umaxv h2, v0.4h + umaxv h3, v1.4h + + umaxv h4, v0.8h + umaxv h5, v1.8h + + umaxv s6, v0.4s + umaxv s7, v1.4s + + umaxv b8, v0.8b + umaxv b9, v1.8b + + umaxv b10, v0.16b + umaxv b11, v1.16b + + )"); + CHECK_NEON(2, uint16_t, + {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(3, uint16_t, + {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(4, uint16_t, + {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(5, uint16_t, + {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(8, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(9, uint8_t, + {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(10, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(11, 
uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + + EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); +} + TEST_P(InstNeon, smax) { initialHeapData_.resize(32); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index ab027b408a..6a75c597cc 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6229,6 +6229,7 @@ TEST_P(InstSve, pfirst) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); + EXPECT_EQ(getNZCV(), 0b0110); EXPECT_GROUP(R"(pfirst p5.b, p1, p5.b)", PREDICATE); } @@ -7202,97 +7203,6 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } -TEST_P(InstSve, umaxv) { - // umaxv vd, vn.t - initialHeapData_.resize(32); - uint8_t* heap = reinterpret_cast(initialHeapData_.data()); - - // v0 - heap[0] = 0x01; - heap[1] = 0x00; - heap[2] = 0xFF; - heap[3] = 0xAA; - heap[4] = 0xBB; - heap[5] = 0xCC; - heap[6] = 0xDD; - heap[7] = 0xEE; - heap[8] = 0x07; - heap[9] = 0x00; - heap[10] = 0xFC; - heap[11] = 0xFD; - heap[12] = 0xBA; - heap[13] = 0xCA; - heap[14] = 0x39; - heap[15] = 0xEF; - - // v1 - heap[16] = 0x00; - heap[17] = 0x00; - heap[18] = 0xEE; - heap[19] = 0x11; - heap[20] = 0x22; - heap[21] = 0x33; - heap[22] = 0x44; - heap[23] = 0x55; - heap[24] = 0x26; - heap[25] = 0xFF; - heap[26] = 0xEA; - heap[27] = 0xFA; - heap[28] = 0x14; - heap[29] = 0x43; - heap[30] = 0x21; - heap[31] = 0xAE; - - RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - - ldr q0, [x0] - ldr q1, [x0, #16] - umaxv h2, v0.4h - umaxv h3, v1.4h - - umaxv h4, v0.8h - umaxv h5, v1.8h - - umaxv s6, v0.4s - umaxv s7, v1.4s - - umaxv b8, v0.8b - umaxv b9, v1.8b - - umaxv b10, v0.16b - umaxv b11, v1.16b - - )"); - CHECK_NEON(2, uint16_t, - {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(3, uint16_t, - {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(4, uint16_t, - {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(5, uint16_t, - {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); - CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); - CHECK_NEON(8, uint8_t, - {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - CHECK_NEON(9, uint8_t, - {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - CHECK_NEON(10, uint8_t, - {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - CHECK_NEON(11, uint8_t, - {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - - EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); -} - TEST_P(InstSve, clastb) { // 64 bit RUN_AARCH64(R"( From 6119ade2f0df8f05439b43220c6b6c23f0e9cafe Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 18 Dec 2024 16:41:34 +0000 Subject: [PATCH 36/38] Responded to more comments --- .../simeng/arch/aarch64/helpers/neon.hh | 8 ++++-- .../simeng/arch/aarch64/helpers/sve.hh | 25 +++++++++---------- src/lib/arch/aarch64/Instruction_execute.cc | 8 +++--- 
src/lib/pipeline/ReorderBuffer.cc | 13 +++++++--- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index 98c1648d6b..e5cf3dd3aa 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -558,11 +558,13 @@ RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues, return {out, 256}; } -/** Helper function for NEON instructions with the format `uaddlv zd, zn.T`. +/** Helper function for NEON instructions with the format `uaddlv rd, Vn.T`. * T represents the type of the destination register (e.g. for h0, T = * uint32_t). * U represents the type of the sourceValues[0] (e.g. for v0.8b, U = * uint8_t) + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). * Returns correctly formatted RegisterValue. */ template RegisterValue vecAddlv(srcValContainer& sourceValues) { @@ -574,8 +576,10 @@ RegisterValue vecAddlv(srcValContainer& sourceValues) { return {out, 256}; } -/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. +/** Helper function for NEON instructions with the format `umaxv rd, Vn.T`. * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). * Returns correctly formatted RegisterValue. */ template RegisterValue vecUMaxV(srcValContainer& sourceValues) { diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 8b23bb0ea9..563cc3ed62 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -877,8 +877,9 @@ RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues, /** Helper function for SVE instructions with the format `ftsmul zd, zn, zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. U represents the same precision as - * T, but as an integer type for the second source register. */ + * U represents the same precision as T, but as an integer type for the second + * source register. + * Returns correctly formatted RegisterValue. */ template RegisterValue sveFTrigSMul(srcValContainer& sourceValues, const uint16_t VL_bits) { @@ -903,8 +904,9 @@ RegisterValue sveFTrigSMul(srcValContainer& sourceValues, /** Helper function for SVE instructions with the format `ftssel zd, zn, zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. U represents the same precision as - * T, but as an integer type for the second source register. */ + * U represents the same precision as T, but as an integer type for the second + * source register. + * Returns correctly formatted RegisterValue. 
*/ template RegisterValue sveFTrigSSel(srcValContainer& sourceValues, const uint16_t VL_bits) { @@ -1096,7 +1098,6 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, const T* n = sourceValues[1].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out; // Get last active element int lastElem = 0; @@ -1109,20 +1110,18 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, // If no active lane has been found, select highest element instead if (i == 0) lastElem = partition_num - 1; } - - out = n[lastElem]; - return {out, 256}; + return {n[lastElem], 256}; } -/** Helper function for SVE instructions with the format `clastb rd, pg, rd, +/** Helper function for SVE instructions with the format `clastb zd, pg, zd, * zn`. * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template -RegisterValue sveCLastBScalar(srcValContainer& sourceValues, - const uint16_t VL_bits) { +RegisterValue sveCLastBSimdScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t m = sourceValues[2].get(); + const T* m = sourceValues[2].getAsVector(); const T* n = sourceValues[3].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); @@ -1139,7 +1138,7 @@ RegisterValue sveCLastBScalar(srcValContainer& sourceValues, } if (lastElem < 0) { - out = m; + out = m[0]; } else { out = n[lastElem]; } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 2c87ee9ed3..6bbb6c0006 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2666,19 +2666,19 @@ void Instruction::execute() { break; } case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_S: { // clastb sd, pg, sn, zn.s - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_H: { // clastb hd, pg, hn, zn.h - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_B: { // clastb bd, pg, bn, zn.b - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index 33326944a3..e53849ea89 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -87,10 +87,17 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { // sufficiently long, assume an error in SimEng has occured. robHeadRepeatCounter_++; if (robHeadRepeatCounter_ > 10000000) { - std::cerr << "[SimEng:ReorderBuffer] Infinite loop detected in rob " - "commit at instruction address " + std::cerr << "[SimEng:ReorderBuffer] Instruction stuck unable to " + "commit at the head of ROB for a very long time at " + "instruction address 0x" << std::hex << uop->getInstructionAddress() << std::dec - << " (" << uop->getMicroOpIndex() << ")." << std::endl; + << " (MicroOp Index: " << uop->getMicroOpIndex() + << "). 
This is unexpected behaviour for most valid core " + "configurations, though may arise in designs with very " + "high latencies or bottlenecks. If this is not the case, " + "please try re-running. Please raise an issue on GitHub " + "if the problem persists." + << std::endl; exit(1); } break; From 6da7f5cf64fe06f780b5c131251a49ed3a9c594f Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 18 Dec 2024 16:50:01 +0000 Subject: [PATCH 37/38] Updated naming for confusing lastb helper --- src/include/simeng/arch/aarch64/helpers/sve.hh | 6 +++--- src/lib/arch/aarch64/Instruction_execute.cc | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 563cc3ed62..08afb5bb19 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1088,12 +1088,12 @@ RegisterValue sveIndex( return {out, 256}; } -/** Helper function for SVE instructions with the format `lastb rd, pg, zn`. +/** Helper function for SVE instructions with the format `lastb zd, pg, zn`. * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template -RegisterValue sveLastBScalar(srcValContainer& sourceValues, - const uint16_t VL_bits) { +RegisterValue sveLastBSimdScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { const uint64_t* p = sourceValues[0].getAsVector(); const T* n = sourceValues[1].getAsVector(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 6bbb6c0006..3090e3cb42 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2650,19 +2650,19 @@ void Instruction::execute() { break; } case Opcode::AArch64_LASTB_VPZ_D: { // lastb dd, pg, zn.d - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LASTB_VPZ_S: { // lastb sd, pg, zn.s - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LASTB_VPZ_H: { // lastb hd, pg, zn.h - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LASTB_VPZ_B: { // lastb bd, pg, zn.b - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d From c9f708b46797db6e55734acc6c9478cb7a4f7c50 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 19 Dec 2024 23:43:29 +0000 Subject: [PATCH 38/38] Fixed issues arising from merge conflicts on Capstone Update branch. 
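For readers unfamiliar with the `lastb`/`clastb` (SIMD & FP scalar) forms handled by the renamed helpers above, the selection logic can be sketched in standalone C++ as below. This is a minimal illustration only — the `lastb`/`clastb` functions and the one-bool-per-lane predicate are hypothetical simplifications, not SimEng code, which packs predicate lanes into 64-bit words inside `sveLastBSimdScalar`/`sveCLastBSimdScalar`.

#include <cstdint>
#include <iostream>
#include <vector>

// lastb: return the last element of `vec` whose predicate lane is active;
// if no lane is active, fall back to the highest-numbered element.
template <typename T>
T lastb(const std::vector<bool>& pred, const std::vector<T>& vec) {
  int last = -1;
  for (size_t i = 0; i < vec.size(); i++) {
    if (pred[i]) last = static_cast<int>(i);
  }
  return (last < 0) ? vec.back() : vec[last];
}

// clastb: as lastb, but if no lane is active the existing scalar `fallback`
// (the destination register's previous value) is kept instead.
template <typename T>
T clastb(const std::vector<bool>& pred, T fallback, const std::vector<T>& vec) {
  int last = -1;
  for (size_t i = 0; i < vec.size(); i++) {
    if (pred[i]) last = static_cast<int>(i);
  }
  return (last < 0) ? fallback : vec[last];
}

int main() {
  std::vector<uint64_t> z = {10, 20, 30, 40};
  std::vector<bool> pg = {true, true, false, false};     // lanes 0 and 1 active
  std::vector<bool> none = {false, false, false, false};  // no active lanes

  std::cout << lastb(pg, z) << "\n";                 // 20: last active lane
  std::cout << lastb(none, z) << "\n";               // 40: no active lane, highest element
  std::cout << clastb(none, uint64_t{99}, z) << "\n";  // 99: no active lane, keep old scalar
  return 0;
}

The only difference between the two is the fallback when no lane is active: `lastb` selects the highest-numbered vector element, while `clastb` keeps the destination register's previous scalar value.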
Updated comment for infinite loop detector --- .../simeng/arch/aarch64/helpers/sve.hh | 14 ++--- src/include/simeng/pipeline/ReorderBuffer.hh | 4 ++ src/lib/arch/aarch64/InstructionMetadata.cc | 55 ++++++++++++++++--- src/lib/arch/aarch64/Instruction_decode.cc | 7 ++- src/lib/pipeline/ReorderBuffer.cc | 11 ++-- test/regression/aarch64/instructions/sve.cc | 2 +- 6 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 08afb5bb19..6d4c0df66a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1120,9 +1120,9 @@ RegisterValue sveLastBSimdScalar(srcValContainer& sourceValues, template RegisterValue sveCLastBSimdScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - const uint64_t* p = sourceValues[1].getAsVector(); - const T* m = sourceValues[2].getAsVector(); - const T* n = sourceValues[3].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out; @@ -1510,8 +1510,8 @@ std::array svePsel( std::tuple, uint8_t> svePfirst( srcValContainer& sourceValues, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / 8; - const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t* dn = sourceValues[2].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); // Set destination d as source n to copy all false lanes and the active lanes // beyond the first std::array out = {dn[0], dn[1], dn[2], dn[3]}; @@ -1535,8 +1535,8 @@ std::tuple, uint8_t> svePnext( const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t* dn = sourceValues[2].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); // Set destination elements to 0 std::array out = {0, 0, 0, 0}; diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index c7ee01fcc6..4c31eeb38a 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -129,6 +129,10 @@ class ReorderBuffer { * the ROB */ uint64_t robHeadRepeatCounter_ = 0; + /** A limit for the counter of how long an instruction can be stuck at the + * head of the ROB before SimEng exits with an exception. */ + uint64_t robHeadRepeatLimit_ = 10000000; + /** The sequence ID of the youngest instruction that should remain after the * current flush. 
*/ uint64_t flushAfter_; diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 9653e3a00a..219023d93a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -89,14 +89,6 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) } case Opcode::AArch64_SMAX_ZI_B: [[fallthrough]]; - case Opcode::AArch64_FTSMUL_ZZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FTSMUL_ZZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FTSSEL_ZZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FTSSEL_ZZZ_S: - [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_H: @@ -108,6 +100,14 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; break; + case Opcode::AArch64_FTSMUL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_S: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_FSUB_ZPmI_D: [[fallthrough]]; case Opcode::AArch64_FSUB_ZPmI_H: @@ -131,6 +131,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[2].access = CS_AC_READ; break; } + case Opcode::AArch64_FTMAD_ZZI_D: + [[fallthrough]]; + case Opcode::AArch64_FTMAD_ZZI_S: { + // Incorrect access types + operands[0].access = CS_AC_READ | CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + break; + } + case Opcode::AArch64_PFIRST_B: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_D: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_S: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_H: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_B: { + // Incorrect access types + operands[0].access = CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + // Doesn't identify implicit NZCV destination + implicitDestinationCount = 1; + implicitDestinations[0] = AARCH64_REG_NZCV; + break; + } + case Opcode::AArch64_CLASTB_VPZ_D: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_S: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_H: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_B: + [[fallthrough]]; case Opcode::AArch64_AND_ZPmZ_D: // Example bytecode - 4901da04 [[fallthrough]]; case Opcode::AArch64_AND_ZPmZ_H: @@ -163,6 +198,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) [[fallthrough]]; case Opcode::AArch64_SMAX_ZPmZ_S: // Example bytecode - 01008804 [[fallthrough]]; + case Opcode::AArch64_SPLICE_ZPZ_D: + [[fallthrough]]; + case Opcode::AArch64_SPLICE_ZPZ_S: + [[fallthrough]]; case Opcode::AArch64_MUL_ZPmZ_B: // Example bytecode - 40001004 [[fallthrough]]; case Opcode::AArch64_MUL_ZPmZ_D: diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index de68245ff6..215ade08fa 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -499,9 +499,10 @@ void Instruction::decode() { // Check first operand access to determine if it's a load or store if (metadata_.operands[0].access & CS_AC_WRITE) { - if (metadata_.id == ARM64_INS_STXR || metadata_.id == ARM64_INS_STLXR || - metadata_.id == ARM64_INS_STLXRB || - metadata_.id == ARM64_INS_STLXRH) { + if (metadata_.id == AARCH64_INS_STXR || + metadata_.id == AARCH64_INS_STLXR || + metadata_.id == AARCH64_INS_STLXRB || + metadata_.id == AARCH64_INS_STLXRH) { // Exceptions to this is load 
condition are exclusive store with a
        // success flag as first operand
        if (microOpcode_ != MicroOpcode::STR_DATA) {
diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc
index e53849ea89..20a2970995 100644
--- a/src/lib/pipeline/ReorderBuffer.cc
+++ b/src/lib/pipeline/ReorderBuffer.cc
@@ -86,17 +86,20 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) {
       // If an instruction has been stuck at the head of the rob for
       // sufficiently long, assume an error in SimEng has occured.
       robHeadRepeatCounter_++;
-      if (robHeadRepeatCounter_ > 10000000) {
+      if (robHeadRepeatCounter_ > robHeadRepeatLimit_) {
         std::cerr << "[SimEng:ReorderBuffer] Instruction stuck unable to "
-                     "commit at the head of ROB for a very long time at "
+                     "commit at the head of the ROB for 10,000,000 cycles at "
                      "instruction address 0x"
                   << std::hex << uop->getInstructionAddress() << std::dec
                   << " (MicroOp Index: " << uop->getMicroOpIndex()
                   << "). This is unexpected behaviour for most valid core "
                      "configurations, though may arise in designs with very "
                      "high latencies or bottlenecks. If this is not the case, "
-                     "please try re-running. Please raise an issue on GitHub "
-                     "if the problem persists."
+                     "please try re-running. If such long stalls are expected, "
+                     "you can increase this limit in "
+                     "`src/include/simeng/pipeline/ReorderBuffer.hh` via the "
+                     "variable `robHeadRepeatLimit_`. Please raise an issue on "
+                     "GitHub if the problem persists."
                   << std::endl;
         exit(1);
       }
       break;
diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc
index 6a75c597cc..f94ee28262 100644
--- a/test/regression/aarch64/instructions/sve.cc
+++ b/test/regression/aarch64/instructions/sve.cc
@@ -5017,7 +5017,7 @@ TEST_P(InstSve, ftsmul) {
                            144.0f, 12.25f},
                           VL / 16));
 
-  EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_MUL);
+  EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_SIMPLE_ARTH_NOSHIFT);
 }
 
 TEST_P(InstSve, ftssel) {
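The ROB watchdog above boils down to a small counter-and-limit pattern; a distilled, self-contained sketch is given below. The `HeadWatchdog` class and its `tick` interface are illustrative assumptions, not SimEng's `ReorderBuffer` API, which tracks the stalled head micro-op via `robHeadRepeatCounter_` and `robHeadRepeatLimit_` as shown in the diff.

#include <cstdint>
#include <iostream>

// Minimal sketch of a "stuck at head" watchdog: a counter increments every
// cycle the same entry sits uncommitted at the head of a queue, resets
// whenever the head makes progress, and trips once a configurable limit is
// exceeded. All names here are illustrative only.
class HeadWatchdog {
 public:
  explicit HeadWatchdog(uint64_t limit) : limit_(limit) {}

  // Call once per cycle with the ID currently at the head of the queue and
  // whether it committed this cycle. Returns true if the watchdog tripped.
  bool tick(uint64_t headId, bool committed) {
    if (committed || headId != lastHeadId_) {
      lastHeadId_ = headId;
      counter_ = 0;
      return false;
    }
    return ++counter_ > limit_;
  }

 private:
  uint64_t limit_;
  uint64_t lastHeadId_ = UINT64_MAX;
  uint64_t counter_ = 0;
};

int main() {
  HeadWatchdog wd(3);  // trips once the same head has repeated more than 3 times
  for (int cycle = 0; cycle < 6; cycle++) {
    bool tripped = wd.tick(/*headId=*/42, /*committed=*/false);
    // Prints "ok" for cycles 0-3 (first sighting plus 3 repeats), then "stuck!"
    std::cout << "cycle " << cycle << (tripped ? ": stuck!\n" : ": ok\n");
  }
  return 0;
}

Resetting the counter whenever the head changes or commits confines the check to genuinely stalled heads, which is why the limit can safely default to a large value such as 10,000,000 cycles.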