From 51ade58d82460d966dc901cf30fa0b22ad4cf5ea Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 7 Oct 2024 17:10:33 +0100 Subject: [PATCH 01/71] Fixed execution logic for UMINP and UMAXP neon instructions. --- .../simeng/arch/aarch64/helpers/neon.hh | 14 ++++++++++++-- src/lib/arch/aarch64/ExceptionHandler.cc | 5 ++--- src/lib/arch/aarch64/InstructionMetadata.cc | 2 +- .../aarch64/instructions/bitmanip.cc | 18 ++++++++++++++++++ test/regression/aarch64/instructions/neon.cc | 8 ++++---- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..17137dcb55 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, m, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), n, sizeof(T) * I); + // Compare each adjacent pair of elements T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::max(n[i], m[i]); + out[i] = std::max(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } @@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, m, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), n, sizeof(T) * I); + T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::min(n[i], m[i]); + out[i] = std::min(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1a..ff7375339f 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -626,8 +626,7 @@ bool ExceptionHandler::init() { break; } - case 293: // rseq - { + case 293: { // rseq stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } @@ -818,7 +817,7 @@ void ExceptionHandler::readLinkAt(span path) { for (size_t i = 0; i < bytesCopied; i += 256) { uint8_t size = std::min(bytesCopied - i, 256ul); stateChange.memoryAddresses.push_back({bufAddress + i, size}); - stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr, size)); + stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr + i, size)); } concludeSyscall(stateChange); diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..ce71ec5b1f 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -244,7 +244,7 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) if (isAlias) { exceptionString_ = "This instruction is an alias. 
The printed mnemonic and operand string " - "differ from what is expected of the Capstone opcode."; + "may differ from the underlying opcode."; } } diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64dc..30eb27fcef 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -274,11 +274,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm w2, w0, #16, #31 ubfm w3, w0, #28, #23 ubfm w4, w0, #30, #27 + + # check alias + mov w10, #-1 + mov w11, #-1 + mov w12, #128 + lsl w10, w12, #1 + lsr w11, w12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x0000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x07A00000ull); EXPECT_EQ(getGeneralRegister(4), 0x01E80000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); RUN_AARCH64(R"( # Fill destination registers with 1s @@ -295,11 +304,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm x2, x0, #16, #63 ubfm x3, x0, #32, #23 ubfm x4, x0, #60, #55 + + # check alias + mov x10, #-1 + mov x11, #-1 + mov x12, #128 + lsl x10, x12, #1 + lsr x11, x12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x00000000000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x000000000000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x007A000000000000ull); EXPECT_EQ(getGeneralRegister(4), 0x0000000007A00000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); } INSTANTIATE_TEST_SUITE_P(AArch64, InstBitmanip, diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..ad11b13e9a 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -2684,8 +2684,8 @@ TEST_P(InstNeon, uminp) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0x00, 0xEE, 0x11, 0x22, 0x33, 0x44, 0x55, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08}); + {0x00, 0x11, 0x22, 0x44, 0xEE, 0xCC, 0xAA, 0x88, 0x00, 0xAA, 0xBB, + 0xDD, 0x01, 0x03, 0x05, 0x07}); } TEST_P(InstNeon, umaxp) { // umaxp vd.16b vn.16b vm.16b @@ -2742,8 +2742,8 @@ TEST_P(InstNeon, umaxp) { )"); CHECK_NEON(2, uint8_t, - {0x01, 0x00, 0xFF, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0xEE, 0xDD, - 0xCC, 0xBB, 0xAA, 0x99, 0x88}); + {0x00, 0xEE, 0x33, 0x55, 0xFF, 0xDD, 0xBB, 0x99, 0x01, 0xFF, 0xCC, + 0xEE, 0x02, 0x04, 0x06, 0x08}); } TEST_P(InstNeon, smax) { From 6a11d7d8b46d02d24e634a85368e69dc0d10d576 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 7 Oct 2024 17:36:38 +0100 Subject: [PATCH 02/71] Implemented ldrsb (32-bit, Post) instruction with test. 
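
LDRSB (post-index) loads a single byte, sign-extends it into the 32-bit
destination register, and only then adds the immediate to the base
register. A minimal sketch of the intended semantics, assuming the byte
has already been read from the address in Xn (names are illustrative,
not the simulator's API):

    #include <cstdint>

    // wt receives the sign-extended byte; xn is updated after the access.
    void ldrsbPost(uint32_t& wt, uint64_t& xn, int8_t loadedByte, int64_t imm) {
      wt = static_cast<uint32_t>(static_cast<int32_t>(loadedByte));
      xn += imm;
    }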
--- src/lib/arch/aarch64/Instruction_address.cc | 4 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 8 ++++++++ test/regression/aarch64/instructions/load.cc | 17 +++++++++++++---- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..a8a98e5edd 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -679,6 +679,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] uint64_t offset = extendOffset(sourceValues_[1].get(), diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..63f8147aa3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3522,6 +3522,14 @@ void Instruction::execute() { results_[0] = memoryData_[0].zeroExtend(16, 256); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + // LOAD + results_[1] = RegisterValue( + static_cast(memoryData_[0].get()), 4); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] // LOAD diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 09269eebb8..2718c1fdb3 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -1222,14 +1222,23 @@ TEST_P(InstLoad, ldrsb) { mov x5, 1 # Load 8-bit values from heap and sign-extend to 32-bits ldrsb w1, [x0, x5, sxtx] + # Post Index + mov x20, x0 + ldrsb w2, [x20], #16 + # Load 8-bit values from heap and sign-extend to 64-bits - ldrsb x2, [x0] - ldrsb x3, [x0, #3] + ldrsb x3, [x0] + ldrsb x4, [x0, #3] + )"); EXPECT_EQ(getGeneralRegister(1), INT8_MAX); - EXPECT_EQ(getGeneralRegister(2), -2); - EXPECT_EQ(getGeneralRegister(3), 64); + EXPECT_EQ(getGeneralRegister(2), -2); + EXPECT_EQ(getGeneralRegister(20), + getGeneralRegister(0) + 16); + + EXPECT_EQ(getGeneralRegister(3), -2); + EXPECT_EQ(getGeneralRegister(4), 64); } TEST_P(InstLoad, ldrsh) { From 520324c4049851675fd71cef41ed7aa968070aab Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 8 Oct 2024 15:31:06 +0100 Subject: [PATCH 03/71] Fixed implementation of NEON CMHS instruction. 
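
CMHS is an unsigned "higher or same" comparison, so the elements must be
compared as uint8_t. The previous int8_t comparison mis-handled any lane
with the top bit set. A small, hedged illustration of the difference
(values chosen for clarity):

    #include <cstdint>

    // 0x80 is 128 as uint8_t but -128 as int8_t.
    bool asUnsigned = static_cast<uint8_t>(0x80) >= static_cast<uint8_t>(0x01);  // true
    bool asSigned = static_cast<int8_t>(0x80) >= static_cast<int8_t>(0x01);      // false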
--- src/lib/arch/aarch64/Instruction_execute.cc | 4 ++-- test/regression/aarch64/instructions/neon.cc | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 63f8147aa3..93c1bfeca2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -700,9 +700,9 @@ void Instruction::execute() { break; } case Opcode::AArch64_CMHSv16i8: { // cmhs vd.16b, vn.16b, vm.16b - results_[0] = vecCompare( + results_[0] = vecCompare( sourceValues_, false, - [](int8_t x, int8_t y) -> bool { return (x >= y); }); + [](uint8_t x, uint8_t y) -> bool { return (x >= y); }); break; } case Opcode::AArch64_CMPEQ_PPzZI_B: { // cmpeq pd.b, pg/z, zn.b, #imm diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index ad11b13e9a..2a28a4e22b 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -727,8 +727,8 @@ TEST_P(InstNeon, cmhs) { heap[1] = 0x7F; heap[2] = INT8_MAX; heap[3] = 1; - heap[4] = -128; - heap[5] = -1; + heap[4] = 128; + heap[5] = 1; heap[6] = 0xAA; heap[7] = 0xBB; heap[8] = 0xCC; @@ -744,7 +744,7 @@ TEST_P(InstNeon, cmhs) { heap[16] = INT8_MAX; heap[17] = 0x7F; heap[18] = 0; - heap[19] = -128; + heap[19] = 128; heap[20] = 1; heap[21] = 0; heap[22] = 0xAA; @@ -772,10 +772,10 @@ TEST_P(InstNeon, cmhs) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + {0x00, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, - {0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + {0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); } From 2b4a88605cead6f45aeeb1c149d5ce6be579c423 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 9 Oct 2024 13:35:38 +0100 Subject: [PATCH 04/71] Implemented UCVTF (fixed-point to float) instruction with test. 
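
UCVTF with an #fbits operand interprets the source register as an
unsigned fixed-point number with fbits fractional bits, so the result is
the integer value divided by 2^fbits. A minimal sketch of that
conversion (illustrative helper name, not the simulator's API):

    #include <cstdint>

    float ucvtfFixedToFloat(uint64_t xn, unsigned fbits) {
      return static_cast<float>(xn) / static_cast<float>(1ull << fbits);
    }

    // e.g. ucvtfFixedToFloat(0x1EE, 2) == 123.5f, matching the new test.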
--- src/lib/arch/aarch64/Instruction_execute.cc | 11 +++++++ test/regression/aarch64/instructions/float.cc | 31 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 93c1bfeca2..4f4e17fa18 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5526,6 +5526,17 @@ void Instruction::execute() { bfm_2imms(sourceValues_, metadata_, false, true); break; } + case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits + // Convert Fixed-Point to FP32 + // Using algorithm from + // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ + const uint64_t xn = sourceValues_[0].get(); + const uint64_t fbits = static_cast(metadata_.operands[2].imm); + std::cerr << xn << " " << fbits << std::endl; + results_[0] = { + static_cast(xn) / static_cast(1ull << fbits), 256}; + break; + } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn results_[0] = {static_cast(sourceValues_[0].get()), 256}; diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index 03f3f799df..bc2d09ea27 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -1453,6 +1453,37 @@ TEST_P(InstFloat, ucvtf) { CHECK_NEON(9, float, {static_cast(UINT64_C(1) << 48), 0.f, 0.f, 0.f}); CHECK_NEON(10, float, {static_cast(UINT64_MAX), 0.f, 0.f, 0.f}); CHECK_NEON(11, float, {0.f, 0.f, 0.f, 0.f}); + + // 32-bit unsigned fixed-point to float + // Numbers have been chosen to have less than 0.0005 fixed-point + // representation error to ensure tests pass + initialHeapData_.resize(12); + heap32 = reinterpret_cast(initialHeapData_.data()); + heap32[0] = 0x000001EE; + heap32[1] = 0x00021F3B; + heap32[2] = 0x32FE6B75; + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # 2 fraction-bits (123.5) + ldr w1, [x0], #4 + ucvtf s1, x1, #0x2 + + # 8 fraction-bits (543.23) + ldr w2, [x0], #4 + ucvtf s2, x2, #0x8 + + + # 23 fraction-bits (101.987654321) + ldr w3, [x0] + ucvtf s3, x3, #0x17 + )"); + CHECK_NEON(1, float, {123.5f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(2, float, {543.23f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(3, float, {101.987654321f, 0.0f, 0.0f, 0.0f}); } TEST_P(InstFloat, frintp) { From e43ada7b0e615744521824735929929b0e53cd04 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 9 Oct 2024 14:58:28 +0100 Subject: [PATCH 05/71] Implemented UCVTF (fixed-point to float) helper function. --- src/include/simeng/arch/aarch64/helpers/float.hh | 15 +++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 9 +-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 454f50070c..8675f5ed0c 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -194,6 +194,21 @@ D fcvtzu_integer(srcValContainer& sourceValues) { return result; } +/** Helper function for SCALAR/FP instructions with the format ucvtf rd, rn + * #fbits. + * D represents the destination register type (e.g. for Sd, D = float). + * N represents the source register type (e.g. for Xn, N = uint32_t). + * Returns single value of type D. 
*/ +template +D ucvtf_fixedToFloat(srcValContainer& sourceValues) { + // Convert Fixed-Point to FP + // Using algorithm from + // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ + const N xn = sourceValues_[0].get(); + const N fbits = static_cast(metadata_.operands[2].imm); + return (static_cast(xn) / static_cast(1ull << fbits)); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4f4e17fa18..97d98cdb8e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5527,14 +5527,7 @@ void Instruction::execute() { break; } case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits - // Convert Fixed-Point to FP32 - // Using algorithm from - // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ - const uint64_t xn = sourceValues_[0].get(); - const uint64_t fbits = static_cast(metadata_.operands[2].imm); - std::cerr << xn << " " << fbits << std::endl; - results_[0] = { - static_cast(xn) / static_cast(1ull << fbits), 256}; + results_[0] = {ucvtf_fixedToFloat(sourceValues_), 256}; break; } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn From 4773af8fdd5f842718c2f0334025c8a2a7ee9776 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 15:13:15 +0100 Subject: [PATCH 06/71] Implemented UDOT (by element) NEON instructions with tests. --- .../simeng/arch/aarch64/helpers/float.hh | 8 +++-- .../simeng/arch/aarch64/helpers/neon.hh | 30 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 11 +++++- .../aarch64/AArch64RegressionTest.hh | 15 ++++---- test/regression/aarch64/instructions/neon.cc | 35 +++++++++++++++++++ 5 files changed, 89 insertions(+), 10 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 8675f5ed0c..0d198f9268 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -200,12 +200,14 @@ D fcvtzu_integer(srcValContainer& sourceValues) { * N represents the source register type (e.g. for Xn, N = uint32_t). * Returns single value of type D. */ template -D ucvtf_fixedToFloat(srcValContainer& sourceValues) { +D ucvtf_fixedToFloat( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { // Convert Fixed-Point to FP // Using algorithm from // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ - const N xn = sourceValues_[0].get(); - const N fbits = static_cast(metadata_.operands[2].imm); + const N xn = sourceValues[0].get(); + const N fbits = static_cast(metadata.operands[2].imm); return (static_cast(xn) / static_cast(1ull << fbits)); } diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index 17137dcb55..c2bf42e6fa 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,36 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.4b[index]`. + * D represents the number of elements in the output vector to be updated (i.e. + * for vd.2s D = 2). Only 2 or 4 are valid. + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue vecUdot_byElement( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + uint32_t acc = vd[i]; + for (int j = 0; j < 4; j++) { + acc += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * index) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `zip<1,2> vd.T, * vn.T, vm.T`. * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 97d98cdb8e..19136d5442 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5527,7 +5527,8 @@ void Instruction::execute() { break; } case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits - results_[0] = {ucvtf_fixedToFloat(sourceValues_), 256}; + results_[0] = { + ucvtf_fixedToFloat(sourceValues_, metadata_), 256}; break; } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn @@ -5568,6 +5569,14 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] + results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_UDOTlanev8i8: { // udot vd.2s, vn.8b, vm.4b[index] + results_[0] = vecUdot_byElement<2>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UMADDLrrr: { // umaddl xd, wn, wm, xa results_[0] = maddl_4ops(sourceValues_); break; diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09d..3e39fa59fe 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -239,13 +239,16 @@ class AArch64RegressionTest : public RegressionTest { /** Get the subtarget feature string based on LLVM version being used */ std::string getSubtargetFeaturesString() { -#if SIMENG_LLVM_VERSION < 14 - return "+sve,+lse"; -#elif SIMENG_LLVM_VERSION < 18 - return "+sve,+lse,+sve2,+sme,+sme-f64"; -#else - return "+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + std::string features = "+dotprod,+sve,+lse"; +#if SIMENG_LLVM_VERSION > 13 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64"; + features += ",+sve2,+sme,+sme-f64"; #endif +#if SIMENG_LLVM_VERSION > 17 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + features += "f64,+sme-i16i64,+sme2"; +#endif + return features; } /** Check the elements of a Neon register. 
diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 2a28a4e22b..92d270288f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3648,6 +3648,41 @@ TEST_P(InstNeon, trn) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, udot) { + // udot by element + initialHeapData_.resize(128); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA9876543210; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + movi v3.4s, #4 + movi v4.4s, #5 + movi v5.4s, #6 + + udot v2.2s, v1.8b, v0.4b[0] + udot v3.4s, v1.16b, v0.4b[1] + udot v4.2s, v1.8b, v0.4b[2] + udot v5.4s, v1.16b, v0.4b[3] + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFF00FF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA9876543210, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0xd929, 0x26f91, 0x0, 0x0}); + CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); + CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); + CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); +} + TEST_P(InstNeon, uzp) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 50a8a20efbf331efc4eb7fff6ef20bc724e96330 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 16:17:01 +0100 Subject: [PATCH 07/71] Implemented LD1 (NEON 8h x2, post index) instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 3 ++ src/lib/arch/aarch64/Instruction_execute.cc | 3 ++ test/regression/aarch64/instructions/load.cc | 47 ++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index a8a98e5edd..df94a5efda 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -275,6 +275,9 @@ span Instruction::generateAddresses() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s: // ld1 {vt1.4s, vt2.4s}, [xn] [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 19136d5442..d7d12040c1 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3106,6 +3106,9 @@ void Instruction::execute() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], // <#imm|xm> // LOAD diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 2718c1fdb3..b59f1f8cf5 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -282,6 +282,53 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 32); + // Two reg, 8h elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + 
mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + # ld1 {v0.8h, v1.8h}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v2.8h, v3.8h}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.8h, v5.8h}, [x0], x1 + + mov x12, x0 + )"); + + // CHECK_NEON(0, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + // CHECK_NEON(1, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + CHECK_NEON(2, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(3, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(4, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(5, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 32); + // Two reg, 2d elements RUN_AARCH64(R"( # Get heap address From 6696d5f861357f42e0e2876a540aa2795b507c24 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 17:34:51 +0100 Subject: [PATCH 08/71] Implemented NEON UMLAL (32 to 64 bit) instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 13 ++++++++++ test/regression/aarch64/instructions/neon.cc | 26 ++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d7d12040c1..caa423871e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5592,6 +5592,19 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMLALv2i32_indexed: { // umlal vd.2d, vn.2s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[0]) * vm_idx_elem, + vd[1] + static_cast(vn[1]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 92d270288f..8ecee526e6 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3758,6 +3758,32 @@ TEST_P(InstNeon, uzp) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, umlal) { + // uint32 to uint64, lower half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[0], w2 + mov v1.s[1], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal v2.2d, v1.2s, v0.s[0] + umlal v3.2d, v1.2s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); +} + TEST_P(InstNeon, zip) { initialHeapData_.resize(128); 
uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From bb5096a6ca1c2acfecf3c55c2021abf7aeefe182 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 17:41:06 +0100 Subject: [PATCH 09/71] Implemented NEON UMLAL2 (32 to 64 bit) instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 13 +++++++++++ test/regression/aarch64/instructions/neon.cc | 24 ++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index caa423871e..aa58b2fe9f 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5605,6 +5605,19 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_UMLALv4i32_indexed: { // umlal2 vd.2d, vn.4s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[2]) * vm_idx_elem, + vd[1] + static_cast(vn[3]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 8ecee526e6..e8ce4f13f2 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3782,6 +3782,30 @@ TEST_P(InstNeon, umlal) { CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); + + // uint32 to uint64, upper half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[2], w2 + mov v1.s[3], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal2 v2.2d, v1.4s, v0.s[0] + umlal2 v3.2d, v1.4s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); } TEST_P(InstNeon, zip) { From 09d65069af365105c5d907cb807d95cfc1707fd4 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 11 Oct 2024 13:00:39 +0100 Subject: [PATCH 10/71] Implemented NEON ST1 (single vector, post index) instruction with tests. 
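
ST1 {Vt.4S}, [Xn|SP], <#imm|Xm> stores the full 128-bit register to
memory and then post-increments the base: the immediate form can only
encode 16, while the register form adds Xm. A minimal sketch of the base
update, assuming the 16 data bytes are written separately (illustrative
names only):

    #include <cstdint>

    uint64_t st1PostBase(uint64_t base, bool usesRegOffset, uint64_t xm) {
      return base + (usesRegOffset ? xm : 16);  // #imm form is fixed at 16
    }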
--- src/lib/arch/aarch64/Instruction_address.cc | 5 +++++ src/lib/arch/aarch64/Instruction_execute.cc | 13 ++++++++++++ test/regression/aarch64/instructions/store.cc | 20 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index df94a5efda..2594e07ed6 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1284,6 +1284,11 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + const uint64_t base = sourceValues_[1].get(); + setMemoryAddresses({base, 16}); + break; + } case Opcode::AArch64_ST1Twov16b: // st1 {vt.16b, vt2.16b}, [xn] [[fallthrough]]; case Opcode::AArch64_ST1Twov16b_POST: // st1 {vt.16b, vt2.16b}, [xn], diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index aa58b2fe9f..13bb362c35 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4724,6 +4724,19 @@ void Instruction::execute() { results_[0] = sourceValues_[4].get() + postIndex; break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + // STORE + const uint32_t* vt = sourceValues_[0].getAsVector(); + memoryData_[0] = RegisterValue((char*)vt, 4 * sizeof(uint32_t)); + + // if #imm post-index, value can only be 16 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[2].get() + : 16; + results_[0] = sourceValues_[1].get() + postIndex; + break; + } case Opcode::AArch64_ST1Twov16b: { // st1 {vt.16b, vt2.16b}, [xn|sp] // STORE const uint8_t* t = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b494..6a8136da37 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -437,6 +437,26 @@ TEST_P(InstStore, st1_multi_struct) { } } + // one reg, 4s elements (post offset only) + RUN_AARCH64(R"( + mov x0, #32 + movi v0.4s, #1 + sub sp, sp, #96 + st1 {v0.4s}, [sp], #16 + st1 {v0.4s}, [sp], x0 + )"); + const uint64_t sp = process_->getInitialStackPointer(); + EXPECT_EQ(getGeneralRegister(31), sp - 48); + EXPECT_EQ(getMemoryValue(sp - 96), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 92), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 88), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 84), static_cast(1)); + + EXPECT_EQ(getMemoryValue(sp - 80), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 76), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 72), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 68), static_cast(1)); + // two reg, 4s elements RUN_AARCH64(R"( mov x0, #32 From f6e7c03afb691fd60dbcc32797ad6c04f8823c43 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 11 Oct 2024 14:39:42 +0100 Subject: [PATCH 11/71] Implemented NEON LD1 (single vector, post index, 8b) instruction with tests. 
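
LD1 {Vt.8B}, [Xn], <#imm|Xm> reads 8 bytes into the low half of Vt (the
upper half is zeroed) and then advances the base by 8 for the immediate
form or by Xm for the register form. A minimal sketch of the behaviour,
assuming the 8 bytes have already been read (illustrative names only):

    #include <cstdint>
    #include <cstring>

    void ld1Post8b(uint8_t vt[16], uint64_t& xn, const uint8_t data[8],
                   bool usesRegOffset, uint64_t xm) {
      std::memset(vt, 0, 16);        // zero the destination, including the upper half
      std::memcpy(vt, data, 8);      // fill the low 64 bits
      xn += usesRegOffset ? xm : 8;  // post-index update
    }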
--- src/lib/arch/aarch64/Instruction_address.cc | 4 +++ src/lib/arch/aarch64/Instruction_execute.cc | 10 ++++++ test/regression/aarch64/instructions/load.cc | 35 ++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 2594e07ed6..0758669e65 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -243,6 +243,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + setMemoryAddresses({{sourceValues_[0].get(), 8}}); + break; + } case Opcode::AArch64_LD1Fourv16b: // ld1 {vt1.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn] [[fallthrough]]; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 13bb362c35..b44ab3bdc0 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2795,6 +2795,16 @@ void Instruction::execute() { results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + // if #imm post-index, value can only be 8 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[1].get() + : 8; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + break; + } case Opcode::AArch64_LD1RD_IMM: { // ld1rd {zt.d}, pg/z, [xn, #imm] // LOAD const uint16_t partition_num = VL_bits / 64; diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index b59f1f8cf5..b98013d2a2 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -231,6 +231,41 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 16); + // One reg, 8b elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #8 + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v1.8b}, [x0], #8 + + # save heap address after post index + mov x11, x0 + + # Load values from heap with reg post-index + ld1 {v2.8b}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + CHECK_NEON(2, uint8_t, + {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 8); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 16); + // Two reg, 16b elements RUN_AARCH64(R"( # Get heap address From 74e9b47c579bb60c949f75282ca06a86db0c797c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 11:54:41 +0100 Subject: [PATCH 12/71] Implemented SVE LD1RQB (imm offset) instruction with tests. 
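
LD1RQB loads a 16-byte quadword from memory under a byte-granular
predicate and replicates that quadword across the whole vector length.
A minimal sketch of the replicate step, assuming the predicated 16-byte
mini-vector has already been formed (illustrative names only):

    #include <cstdint>

    void replicateQuadword(uint8_t* zd, const uint8_t quad[16], unsigned vlBits) {
      for (unsigned i = 0; i < vlBits / 8; i++) zd[i] = quad[i % 16];
    }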
--- src/lib/arch/aarch64/Instruction_address.cc | 6 +++ src/lib/arch/aarch64/Instruction_execute.cc | 23 +++++++++++ test/regression/aarch64/instructions/sve.cc | 43 +++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 0758669e65..b7dc32176d 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -141,6 +141,12 @@ span Instruction::generateAddresses() { } break; } + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm}] + uint64_t addr = + sourceValues_[1].get() + metadata_.operands[2].mem.disp; + setMemoryAddresses({addr, static_cast(16)}); + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] uint64_t addr = sourceValues_[1].get() + metadata_.operands[2].mem.disp; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index b44ab3bdc0..1e5813bde2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2832,6 +2832,29 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm }] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 8; + uint8_t out[256] = {0}; + const uint8_t* data = memoryData_[0].getAsVector(); + + // Get mini-vector (quadword) + uint8_t mini[16] = {0}; + for (int i = 0; i < 16; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (p[i / 64] & shifted_active) { + mini[i] = data[i]; + } + } + + // Duplicate mini-vector into output vector + for (int i = 0; i < partition_num; i++) { + out[i] = mini[i % 16]; + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b95..c1db0317fd 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4641,6 +4641,49 @@ TEST_P(InstSve, ld1rd) { CHECK_NEON(3, uint64_t, fillNeon({0x12345678}, VL / 16)); } +TEST_P(InstSve, ld1rqb) { + initialHeapData_.resize(32); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, + {0x12345678DEADBEEF, 0xABCDEF0198765432, + 0xABBACAFEFEDCBA98, 0xFEEDABCDBEADCABB}, + 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.b + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, #16] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + add x0, x0, #32 + ld1rqb {z3.b}, p1/z, [x0, #-16] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); +} + TEST_P(InstSve, ld1rqd) { initialHeapData_.resize(32); uint64_t* 
heap64 = reinterpret_cast(initialHeapData_.data()); From 4daf7050b5d83644b0f94a6725f6e68ef4e99133 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 12:09:13 +0100 Subject: [PATCH 13/71] Implemented SVE LD1RQB (reg offset) instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 6 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 1 + test/regression/aarch64/instructions/sve.cc | 37 ++++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index b7dc32176d..cd453f311e 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -141,6 +141,12 @@ span Instruction::generateAddresses() { } break; } + case Opcode::AArch64_LD1RQ_B: { // ld1rqb {zd.b}, pg/z, [xn, xm] + uint64_t addr = + sourceValues_[1].get() + sourceValues_[2].get(); + setMemoryAddresses({addr, static_cast(16)}); + break; + } case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm}] uint64_t addr = sourceValues_[1].get() + metadata_.operands[2].mem.disp; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1e5813bde2..1ed2ff172a 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2832,6 +2832,7 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1RQ_B: // ld1rqb {zd.b}, pg/z, [xn, xm] case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm }] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c1db0317fd..47618e4fef 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4648,7 +4648,7 @@ TEST_P(InstSve, ld1rqb) { {0x12345678DEADBEEF, 0xABCDEF0198765432, 0xABBACAFEFEDCBA98, 0xFEEDABCDBEADCABB}, 4); - + // Imm offset RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -4682,6 +4682,41 @@ TEST_P(InstSve, ld1rqb) { fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, VL / 8)); + + // Reg offset + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.b + mov x1, #16 + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, x1] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + ld1rqb {z3.b}, p1/z, [x0, x1] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); } TEST_P(InstSve, ld1rqd) { From 810a3240ce398e0199f7ae9dc99875b71c0a8579 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 13:01:55 +0100 Subject: [PATCH 14/71] Implemented SVE UDOT (4-way, indexed) instruction and tests. 
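
UDOT (4-way, indexed) accumulates, into each 32-bit lane of Zd, the sum
of four byte-by-byte products between a group of four unsigned bytes
from Zn and the four bytes selected from Zm by the index. A minimal
sketch of one lane's accumulation (illustrative helper, not the
simulator's API):

    #include <cstdint>

    uint32_t udotLane(uint32_t acc, const uint8_t zn[4], const uint8_t zm[4]) {
      for (int j = 0; j < 4; j++)
        acc += static_cast<uint32_t>(zn[j]) * static_cast<uint32_t>(zm[j]);
      return acc;
    }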
--- .../simeng/arch/aarch64/helpers/sve.hh | 31 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 5 +++ test/regression/aarch64/instructions/sve.cc | 26 ++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe6..b19600a0dc 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1563,6 +1563,37 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `udot zd, zn, + * zm[index]`. + * D represents the element type of the destination register (i.e. for uint32_t, + * D = uint32_t). + * N represents the element type of the source registers (i.e. for uint8_t, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot_indexed( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + D acc = zd[i]; + for (int j = 0; j < W; j++) { + acc += (static_cast(zn[(W * i) + j]) * + static_cast(zm[(W * index) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `unpk>hi,lo> zd, * zn`. * D represents the type of the destination register (e.g. 
int32_t for diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1ed2ff172a..08d3f49ae3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5619,6 +5619,11 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] + results_[0] = sveUdot_indexed(sourceValues_, + metadata_, VL_bits); + break; + } case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); break; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 47618e4fef..9ef230d575 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7098,6 +7098,32 @@ TEST_P(InstSve, uaddv) { CHECK_NEON(3, uint64_t, {(9 * (VL / 128)), 0}); } +TEST_P(InstSve, udot) { + // udot by element + initialHeapData_.resize(16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + + dup z2.b, #2 + dup z3.b, #3 + dup z4.s, #4 + dup z5.s, #5 + + udot z4.s, z2.b, z0.b[0] + udot z5.s, z3.b, z0.b[3] + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); +} + TEST_P(InstSve, uqdec) { // d arrangement RUN_AARCH64(R"( From 2db08ae0484bdff080bef28fabc069c25a43e768 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 13:18:34 +0100 Subject: [PATCH 15/71] Implemented SVE ZIP1+2 (byte) instructions and tests. --- CMakeLists.txt | 4 ++-- src/lib/arch/aarch64/Instruction_execute.cc | 8 ++++++++ test/regression/aarch64/Exception.cc | 2 -- test/regression/aarch64/instructions/sme.cc | 4 ---- test/regression/aarch64/instructions/sve.cc | 15 ++++++++++----- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b8f4379b98..d0691578fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,10 +155,10 @@ if(SIMENG_ENABLE_TESTS) # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") - message(STATUS "LLVM version does not support AArch64 extensions SME or SVE2. These test suites will be skipped.") + message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.") endif() if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") - message(STATUS "LLVM version does not support AArch64 extensions SME2. These test suites will be skipped.") + message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. 
Related test will fail.") endif() else() diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 08d3f49ae3..0c7a3f6250 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5943,6 +5943,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } + case Opcode::AArch64_ZIP1_ZZZ_B: { // zip1 zd.b, zn.b, zm.b + results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); + break; + } case Opcode::AArch64_ZIP1_ZZZ_D: { // zip1 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); break; @@ -5995,6 +5999,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } + case Opcode::AArch64_ZIP2_ZZZ_B: { // zip2 zd.b, zn.b, zm.b + results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); + break; + } case Opcode::AArch64_ZIP2_ZZZ_D: { // zip2 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); break; diff --git a/test/regression/aarch64/Exception.cc b/test/regression/aarch64/Exception.cc index b987ae4429..2133629473 100644 --- a/test/regression/aarch64/Exception.cc +++ b/test/regression/aarch64/Exception.cc @@ -151,7 +151,6 @@ TEST_P(Exception, unmapped_sys_reg) { EXPECT_EQ(stdout_.substr(0, strlen(err)), err); } -#if SIMENG_LLVM_VERSION >= 14 // TODO: Write test for InstructionException::StreamingModeUpdate once it has a // trigger case // TODO: Write test for InstructionException::ZAregisterStatusUpdate once it has @@ -371,7 +370,6 @@ TEST_P(Exception, svcr) { fillNeon({0}, SVL / 8)); } } -#endif INSTANTIATE_TEST_SUITE_P( AArch64, Exception, diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 55c7b945f3..68f686609c 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,7 +7,6 @@ namespace { using InstSme = AArch64RegressionTest; -#if SIMENG_LLVM_VERSION >= 14 TEST_P(InstSme, mova) { // 8-bit RUN_AARCH64(R"( @@ -576,8 +575,5 @@ TEST_P(InstSme, zero) { INSTANTIATE_TEST_SUITE_P(AArch64, InstSme, ::testing::ValuesIn(genCoreTypeSVLPairs(EMULATION)), paramToString); -#else -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InstSme); -#endif } // namespace \ No newline at end of file diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 9ef230d575..ea8021fc96 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -8087,14 +8087,12 @@ TEST_P(InstSve, zip_pred) { } TEST_P(InstSve, zip) { - // d arrangement RUN_AARCH64(R"( # 64-bit fdup z0.d, #0.5 fdup z1.d, #-0.5 fdup z2.d, #0.75 fdup z3.d, #-0.75 - zip1 z4.d, z0.d, z1.d zip2 z5.d, z2.d, z3.d @@ -8105,16 +8103,24 @@ TEST_P(InstSve, zip) { fdup z9.s, #0.75 zip1 z10.s, z6.s, z7.s zip2 z11.s, z8.s, z9.s + + # 8-bit + dup z12.b, #1 + dup z13.b, #-2 + dup z14.b, #-1 + dup z15.b, #2 + zip1 z16.b, z12.b, z13.b + zip2 z17.b, z14.b, z15.b )"); CHECK_NEON(4, double, fillNeon({0.5, -0.5}, VL / 8)); CHECK_NEON(5, double, fillNeon({0.75, -0.75}, VL / 8)); CHECK_NEON(10, float, fillNeon({0.5, -0.75}, VL / 8)); CHECK_NEON(11, float, fillNeon({-0.5, 0.75}, VL / 8)); + CHECK_NEON(16, int8_t, fillNeon({1, -2}, VL / 8)); + CHECK_NEON(17, int8_t, fillNeon({-1, 2}, VL / 8)); } -#if SIMENG_LLVM_VERSION >= 14 -// If LLVM version supports SVE2 : TEST_P(InstSve, psel) { RUN_AARCH64(R"( mov w13, #0 @@ -8148,7 +8154,6 @@ 
TEST_P(InstSve, psel) { CHECK_PREDICATE(14, uint64_t, fillPred(VL / 8, {0}, 4)); CHECK_PREDICATE(15, uint64_t, fillPred(VL / 8, {0}, 8)); } -#endif INSTANTIATE_TEST_SUITE_P(AArch64, InstSve, ::testing::ValuesIn(genCoreTypeVLPairs(EMULATION)), From 7ac89e862ca2166c86156fd831e3dc4fabe62fce Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 14:49:06 +0100 Subject: [PATCH 16/71] Implemented SVE faddv (float and double) instructions and tests. --- .../simeng/arch/aarch64/helpers/sve.hh | 21 +++++ src/lib/arch/aarch64/Instruction_execute.cc | 10 +++ test/regression/aarch64/instructions/sve.cc | 78 +++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index b19600a0dc..25ea3dede9 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -626,6 +626,27 @@ std::enable_if_t, RegisterValue> sveFDivPredicated( return {out, 256}; } +/** Helpfer function for SVE instructions with the format `faddv rd, pg, zn. + * D represents the source vector element type and the destination scalar + * register type (i.e. for zn.s and sd, D = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFaddv_predicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const D* zn = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (8 * sizeof(D)); + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(D))) * sizeof(D)); + if (p[i / (64 / sizeof(D))] & shifted_active) { + out[0] += zn[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `fmad zd, pg/m, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). 
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 0c7a3f6250..5bc1e088ca 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1388,6 +1388,16 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_FADDV_VPZ_D: { // faddv dd, p0, zn.d + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FADDV_VPZ_S: { // faddv sd, p0, zn.s + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_FCADD_ZPmZ_D: { // fcadd zdn.d, pg/m, zdn.d, zm.d, // #imm results_[0] = diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index ea8021fc96..03e3e4e870 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -2852,6 +2852,84 @@ TEST_P(InstSve, fadda) { CHECK_NEON(3, double, {resultB, 0}); } +TEST_P(InstSve, faddv) { + // float + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrc = { + 1.0f, -42.76f, -0.125f, 0.0f, 40.26f, -684.72f, -0.15f, 107.86f, + -34.71f, -0.917f, 0.0f, 80.72f, -125.67f, -0.01f, 701.90f, 7.0f}; + fillHeap(fheap, fsrc, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #4 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.s + whilelo p1.s, xzr, x2 + + ld1w {z0.s}, p0/z, [x0] + + faddv s3, p0, z0.s + faddv s4, p1, z0.s + )"); + float s3 = 0.0f; + float s4 = 0.0f; + for (int i = 0; i < VL / 32; i++) { + s3 += fsrc[i % (fsrc.size())]; + if (i < (VL / 64)) s4 += fsrc[i % (fsrc.size())]; + } + CHECK_NEON(3, float, {s3, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(4, float, {s4, 0.0f, 0.0f, 0.0f}); + + // double + initialHeapData_.resize(VL); + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector dsrc = {1.0, -42.76, -0.125, 0.0, 40.26, -684.72, + -0.15, 107.86, -34.71, -0.917, 0.0, 80.72, + -125.67, -0.01, 701.90, 7.0}; + fillHeap(dheap, dsrc, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #8 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.d + whilelo p1.d, xzr, x2 + + ld1d {z0.d}, p0/z, [x0] + + faddv d3, p0, z0.d + faddv d4, p1, z0.d + )"); + double d3 = 0.0; + double d4 = 0.0; + for (int i = 0; i < VL / 64; i++) { + d3 += dsrc[i % (dsrc.size())]; + if (i < (VL / 128)) d4 += dsrc[i % (dsrc.size())]; + } + CHECK_NEON(3, double, {d3, 0.0}); + CHECK_NEON(4, double, {d4, 0.0}); +} + TEST_P(InstSve, fcmge) { // double initialHeapData_.resize(VL / 16); From bb737611352886e648038ac9dbf09beeb401f5f2 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 16:07:10 +0100 Subject: [PATCH 17/71] Implemented SVE PTRUE (as counter) instructions with tests. 
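
A worked example of the predicate-as-counter encoding used by the new
helper: with a 128-bit vector length, ptrue pnd.s sets the invert bit
(0x8000), the element-size field for 32-bit elements (0b100, occupying
3 bits), and an element count of 128/32 = 4 shifted past the size field,
giving 0x8000 | 0b100 | (4 << 3) = 0x8024. The regression test checks
the same value via 0x8004 | ((VL/32) << 3).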
--- .../simeng/arch/aarch64/helpers/sve.hh | 42 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 +++++++ test/regression/aarch64/instructions/sve.cc | 25 +++++++++++ 3 files changed, 83 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 25ea3dede9..3736a7c766 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1340,6 +1340,48 @@ std::array svePtrue( return out; } +/** Helper function for SVE instructions with the format `ptrue pnd. + * T represents the type of sourceValues (e.g. for pnd.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array svePtrue_counter(const uint16_t VL_bits) { + // Predicate as counter is 16-bits and has the following encoding: + // - Up to first 4 bits encode the element size (0b1, 0b10, 0b100, 0b1000 + // for b h s d respectively) + // - bits 0->LSZ + // - Bits LSZ -> 14 represent a uint of the number of consecutive elements + // from element 0 that are active / inactive + // - If invert bit = 0 it is number of active elements + // - If invert bit = 1 it is number of inactive elements + // - Bit 15 represents the invert bit + std::array out = {0, 0, 0, 0}; + + // Set invert bit + out[0] |= 0b1000000000000000; + + // Set Element size field + uint8_t bitsUsed = 0; + if (sizeof(T) == 1) { + out[0] |= 0b1; + bitsUsed += 1; + } else if (sizeof(T) == 2) { + out[0] |= 0b10; + bitsUsed += 2; + } else if (sizeof(T) == 4) { + out[0] |= 0b100; + bitsUsed += 3; + } else if (sizeof(T) == 8) { + out[0] |= 0b1000; + bitsUsed += 4; + } + + // Set Element count (max value is 256 (2048 bit VL for pnd.b)) + const uint64_t elementCount = VL_bits / (sizeof(T) * 8); + out[0] |= (elementCount << bitsUsed); + + return out; +} + /** Helper function for SVE instructions with the format `punpk pd.h, * pn.b`. * If `isHI` = false, then PUNPKLO is performed. 
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 5bc1e088ca..879304f6a2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4111,6 +4111,22 @@ void Instruction::execute() { results_[0] = svePtrue(metadata_, VL_bits); break; } + case Opcode::AArch64_PTRUE_C_B: { // ptrue pnd.b + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_D: { // ptrue pnd.d + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_H: { // ptrue pnd.h + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_S: { // ptrue pnd.s + results_[0] = svePtrue_counter(VL_bits); + break; + } case Opcode::AArch64_PUNPKHI_PP: { // punpkhi pd.h, pn.b results_[0] = svePunpk(sourceValues_, VL_bits, true); break; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 03e3e4e870..21a51a5d1a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5816,6 +5816,31 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, ptrue_counter) { + RUN_AARCH64(R"( + ptrue pn8.s + ptrue pn9.d + ptrue pn10.b + ptrue pn11.h + )"); + const uint64_t ps = + 0b0000000000000000000000000000000000000000000000001000000000000100 | + ((static_cast(VL / 32)) << 3); + const uint64_t pd = + 0b0000000000000000000000000000000000000000000000001000000000001000 | + ((static_cast(VL / 64)) << 4); + const uint64_t pb = + 0b0000000000000000000000000000000000000000000000001000000000000001 | + ((static_cast(VL / 8)) << 1); + const uint64_t ph = + 0b0000000000000000000000000000000000000000000000001000000000000010 | + ((static_cast(VL / 16)) << 2); + CHECK_PREDICATE(8, uint64_t, {ps, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(9, uint64_t, {pd, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(10, uint64_t, {pb, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(11, uint64_t, {ph, 0x0, 0x0, 0x0}); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b From 9febab0c62e82d05740abe8c83bd2bf687912875 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 24 Oct 2024 14:05:01 +0100 Subject: [PATCH 18/71] Added paciasp and autiasp empty execution logic. 
--- src/lib/arch/aarch64/Instruction_address.cc | 13 ++++++ src/lib/arch/aarch64/Instruction_execute.cc | 44 +++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index cd453f311e..d7874a44a4 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -327,6 +327,19 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, + // [xn{, + // // #imm, mul vl}] + // const uint16_t partition_num = VL_bits / 64; + + // const uint64_t base = sourceValues_[1].get(); + // const uint64_t offset = + // static_cast(metadata_.operands[5].mem.disp); + // const uint64_t addr = base + (offset * 4 * partition_num * 8); + + // setMemoryAddresses({addr, static_cast((VL_bits / 8) * 4)}); + // break; + // } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 879304f6a2..303a16c4ae 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2752,6 +2752,42 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, + // [xn{, + // // #imm, mul vl}] + // // LOAD + // const uint64_t* pn = sourceValues_[0].getAsVector(); + // const uint64_t* data = memoryData_[0].getAsVector(); + + // const uint16_t partition_num = VL_bits / 64; + + // // Get predicate-as-counter information + // const bool invert = + // (pn[0] & static_cast(0b1000000000000000)) != 0; + // const uint64_t numElems = + // (pn[0] & static_cast(0b0111111111110000)) >> 4; + + // uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; + // uint64_t index = 0; + + // for (int r = 0; r < 4; r++) { + // // If invert = 1, dictates number of inactive elements at start of + // // each + // // vector. Otherwise, it is number of active elements at start of + // each + // // vector. + // int iMax = (invert) ? partition_num : numElems; + // for (int i = (invert) ? numElems : 0; i < iMax; i++) { + // out[r][i] = data[index]; + // index++; + // } + // } + // results_[0] = {out[0], 256}; + // results_[1] = {out[1], 256}; + // results_[2] = {out[2], 256}; + // results_[3] = {out[3], 256}; + // break; + // } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] // LOAD @@ -4062,6 +4098,14 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_AUTIASP: // autiasp + [[fallthrough]]; + case Opcode::AArch64_PACIASP: { // paciasp + const uint64_t x30 = sourceValues_[0].get(); + // Mimic execution by writing leaving x30 unmodified + results_[0] = {x30, 8}; + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; From b45d8d782dc3f50943994f93a9e5f98357dda85c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 10:32:28 +0100 Subject: [PATCH 19/71] Implemented NEON UMULL (uint16 to uint32) instruction and tests. 
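As a sanity check (illustrative only, not part of this patch), the per-lane widening multiply matches the expected values used in the new regression test:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Each 16-bit source lane is widened to 32 bits before multiplying.
      assert(static_cast<uint32_t>(0xFFFF) * static_cast<uint32_t>(0xFFFF) ==
             4294836225u);
      assert(static_cast<uint32_t>(0xBEEF) * static_cast<uint32_t>(0xABBA) ==
             2148818598u);
      return 0;
    }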
--- src/lib/arch/aarch64/Instruction_execute.cc | 11 ++++++ test/regression/aarch64/instructions/neon.cc | 36 ++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 303a16c4ae..0b4ce8a622 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5767,6 +5767,17 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_UMULLv4i16_v4i32: { // umull vd.4s, vn.4h, vm.4h + const uint16_t* vn = sourceValues_[0].getAsVector(); + const uint16_t* vm = sourceValues_[1].getAsVector(); + + uint32_t out[4] = {0}; + for (int i = 0; i < 4; i++) { + out[i] = static_cast(vn[i]) * static_cast(vm[i]); + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UQDECD_WPiI: { // uqdecd wd{, pattern{, MUL #imm}} results_[0] = sveUqdec(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index e8ce4f13f2..ca9ae26a4e 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3808,6 +3808,42 @@ TEST_P(InstNeon, umlal) { CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); } +TEST_P(InstNeon, umull) { + // uint16_t to uint32_t + initialHeapData_.resize(32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + heap16[0] = UINT16_MAX; + heap16[1] = 0; + heap16[2] = 1234; + heap16[3] = 0xBEEF; + heap16[4] = 0xABBA; + heap16[5] = 0xCAFE; + heap16[6] = 0xDEAD; + heap16[7] = 0xACDC; + + heap16[8] = UINT16_MAX; + heap16[9] = 0xACDC; + heap16[10] = 0xCAFE; + heap16[11] = 0xABBA; + heap16[12] = 0xBEEF; + heap16[13] = 0xDEAD; + heap16[14] = 9876; + heap16[15] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + umull v2.4s, v0.4h, v1.4h + )"); + CHECK_NEON(2, uint32_t, {4294836225u, 0, 64126044u, 2148818598u}); +} + TEST_P(InstNeon, zip) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 6383d983896d4dc40c17b5100617acf3ae36e6bb Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 11:17:34 +0100 Subject: [PATCH 20/71] Implemented RDSVL and tests. 
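A minimal sketch of the semantics (illustrative only, not part of this patch; assumes a 512-bit Streaming-Vector-Length): RDSVL multiplies the signed immediate by the streaming vector length in bytes regardless of streaming-mode state, whereas RDVL uses the current VL.

    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t SVL_bits = 512;  // assumed Streaming-Vector-Length
      auto rdsvl = [&](int64_t imm) { return imm * (SVL_bits / 8); };
      assert(rdsvl(3) == 192);
      assert(rdsvl(-3) == -192);
      assert(rdsvl(0) == 0);
      return 0;
    }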
--- src/include/simeng/arch/aarch64/ArchInfo.hh | 3 ++- src/lib/arch/aarch64/Instruction_execute.cc | 13 +++++++++++-- test/regression/aarch64/instructions/sme.cc | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/ArchInfo.hh b/src/include/simeng/arch/aarch64/ArchInfo.hh index 1403da08f8..b7f2740353 100644 --- a/src/include/simeng/arch/aarch64/ArchInfo.hh +++ b/src/include/simeng/arch/aarch64/ArchInfo.hh @@ -18,7 +18,8 @@ class ArchInfo : public simeng::arch::ArchInfo { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}), + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}), zaSize_(config["Core"]["Streaming-Vector-Length"].as() / 8) { // Generate the architecture-defined architectural register structure archRegStruct_ = { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 0b4ce8a622..7172a35de7 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4187,9 +4187,18 @@ void Instruction::execute() { results_[0] = rbit(sourceValues_, metadata_); break; } + case Opcode::AArch64_RDSVLI_XI: { // rdsvl xd, #imm + // Uses Streaming SVE vector register size, regardless of streaming mode + // state + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast( + architecture_.getStreamingVectorLength() / 8); + break; + } case Opcode::AArch64_RDVLI_XI: { // rdvl xd, #imm - int8_t imm = static_cast(metadata_.operands[1].imm); - results_[0] = (uint64_t)(imm * (VL_bits / 8)); + // Uses current vector register size + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast(VL_bits / 8); break; } case Opcode::AArch64_RET: { // ret {xr} diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 68f686609c..7171b0da0f 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -271,6 +271,21 @@ TEST_P(InstSme, ld1w) { {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); } +TEST_P(InstSme, rdsvl) { + RUN_AARCH64(R"( + rdsvl x0, #-32 + rdsvl x1, #-3 + rdsvl x2, #0 + rdsvl x3, #3 + rdsvl x4, #31 + )"); + EXPECT_EQ(getGeneralRegister(0), (SVL / 8) * -32); + EXPECT_EQ(getGeneralRegister(1), (SVL / 8) * -3); + EXPECT_EQ(getGeneralRegister(2), 0); + EXPECT_EQ(getGeneralRegister(3), (SVL / 8) * 3); + EXPECT_EQ(getGeneralRegister(4), (SVL / 8) * 31); +} + TEST_P(InstSme, st1d) { // Horizontal initialHeapData_.resize(SVL / 4); From 64162374ad15a42c9c5312f0f0396fad2c5a82fa Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 12:11:55 +0100 Subject: [PATCH 21/71] Implemented ZERO {zt0} instruction with test. 
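Note (illustrative only, not part of this patch): ZT0 has a fixed 512-bit width regardless of SVL, so zeroing it amounts to a single 64-byte write, which the new CHECK_TABLE helper then inspects element by element.

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::array<uint8_t, 64> zt0;  // 512-bit lookup-table register
      zt0.fill(0);                  // effect of `zero {zt0}`
      for (uint8_t byte : zt0) assert(byte == 0);
      return 0;
    }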
--- src/lib/arch/aarch64/Instruction_execute.cc | 9 +++++ test/integration/ConfigTest.cc | 6 ++-- .../aarch64/AArch64RegressionTest.hh | 35 +++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 9 +++++ test/unit/aarch64/ArchInfoTest.cc | 3 +- 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 7172a35de7..c41c80b2b5 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6139,6 +6139,15 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_ZERO_T: { // zero {zt0} + // SME + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + // ZT0 has a fixed width of 512-bits + results_[0] = RegisterValue(0, 64); + break; + } default: return executionNYI(); } diff --git a/test/integration/ConfigTest.cc b/test/integration/ConfigTest.cc index 48975eeacd..49a028ebb8 100644 --- a/test/integration/ConfigTest.cc +++ b/test/integration/ConfigTest.cc @@ -24,7 +24,8 @@ TEST(ConfigTest, Default) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, @@ -384,7 +385,8 @@ TEST(ConfigTest, configFromFile) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 3e39fa59fe..8285726ee7 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -190,6 +190,24 @@ inline std::vector> genCoreTypeSVLPairs( checkMatrixRegisterCol(tag, index, __VA_ARGS__); \ } +/** Check each element of the Lookup Table register ZT0 against expected values. + * + * The `tag` argument is the register index (must be 0), and the `type` argument + * is the C++ data type to use for value comparisons. The third argument should + * be an initializer list containing one value for each register element (for a + * total of `(64 / sizeof(type))` values). + * + * For example: + * + * // Compare zt0 to some expected 32-bit uint64 values. + * CHECK_TABLE(0, uint32_t, {1, 2, 3, 4, ..., 16}); + */ +#define CHECK_TABLE(tag, type, ...) \ + { \ + SCOPED_TRACE("<<== error generated here"); \ + checkTableRegister(tag, __VA_ARGS__); \ + } + /** A helper macro to predecode the first instruction in a snippet of Armv9.2-a * assembly code and check the assigned group(s) for each micro-op matches the * expected group(s). Returns from the calling function if a fatal error occurs. @@ -361,6 +379,23 @@ class AArch64RegressionTest : public RegressionTest { } } + /** Check the elements of the ZT0 lookup table register. + * + * This should be invoked via the `CHECK_TABLE` macro in order to provide + * better diagnostic messages, rather than called directly from test code. 
+ */ + template + void checkTableRegister(uint8_t tag, + const std::array& values) const { + assert(tag == 0 && "Only a tag of value 0 is valid for Table registers"); + const T* data = RegressionTest::getVectorRegister( + {simeng::arch::aarch64::RegisterType::TABLE, tag}); + for (unsigned i = 0; i < (64 / sizeof(T)); i++) { + EXPECT_NEAR(data[i], values[i], 0.0005) + << "Mismatch for element " << i << "."; + } + } + /** Get the value of a general purpose register. */ template T getGeneralRegister(uint8_t tag) const { diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 7171b0da0f..01def5ce7d 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -534,6 +534,15 @@ TEST_P(InstSme, st1w) { } TEST_P(InstSme, zero) { + // ZT0 + RUN_AARCH64(R"( + smstart + + zero {zt0} + )"); + CHECK_TABLE(0, uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); + + // ZA tiles RUN_AARCH64(R"( smstart diff --git a/test/unit/aarch64/ArchInfoTest.cc b/test/unit/aarch64/ArchInfoTest.cc index 39e25a0bd1..a2b41a9ec2 100644 --- a/test/unit/aarch64/ArchInfoTest.cc +++ b/test/unit/aarch64/ArchInfoTest.cc @@ -23,7 +23,8 @@ class AArch64ArchInfoTest : public ::testing::Test { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; const std::vector archRegStruct = { {8, 32}, From 9a3dc35261ec772c56a912e2aabeeae9aaa1944f Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 17:05:46 +0100 Subject: [PATCH 22/71] Implemented ld1d (4 consec vecs, uint64) SVE instruction with tests, and fixed PTRUE (counter) implementation. 
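For reference, a rough sketch (not part of this patch; assumes a 256-bit VL) of how the counter encoding is expanded to per-element activity across the four destination vectors, mirroring the decode used in the LD1D (4 reg) execution case:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint16_t VL_bits = 256;               // assumed VL
      const uint16_t elemsPerVec = VL_bits / 64;  // .d elements per vector
      const uint64_t pn = 0b1000000000001000;     // ptrue pnd.d: invert=1, count=0
      const bool invert = (pn & 0x8000) != 0;
      const uint64_t count = (pn & 0x7FF0) >> 4;  // element count field for .d
      for (int r = 0; r < 4; r++) {
        for (int i = 0; i < elemsPerVec; i++) {
          // Inside the first `count` elements: active unless inverted.
          bool active = (static_cast<uint64_t>(r * elemsPerVec + i) < count)
                            ? !invert : invert;
          std::printf("z%d[%d]: %s\n", r, i, active ? "active" : "inactive");
        }
      }
      return 0;
    }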
--- .../simeng/arch/aarch64/helpers/sve.hh | 12 +--- src/lib/arch/aarch64/Instruction_address.cc | 34 ++++++---- src/lib/arch/aarch64/Instruction_execute.cc | 64 +++++++++---------- test/regression/aarch64/instructions/sve.cc | 59 ++++++++++++++--- 4 files changed, 105 insertions(+), 64 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 3736a7c766..df924c1f8c 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1356,29 +1356,21 @@ std::array svePtrue_counter(const uint16_t VL_bits) { // - Bit 15 represents the invert bit std::array out = {0, 0, 0, 0}; - // Set invert bit + // Set invert bit to 1 and count to 0 + // (The first 0 elements are FALSE) out[0] |= 0b1000000000000000; // Set Element size field - uint8_t bitsUsed = 0; if (sizeof(T) == 1) { out[0] |= 0b1; - bitsUsed += 1; } else if (sizeof(T) == 2) { out[0] |= 0b10; - bitsUsed += 2; } else if (sizeof(T) == 4) { out[0] |= 0b100; - bitsUsed += 3; } else if (sizeof(T) == 8) { out[0] |= 0b1000; - bitsUsed += 4; } - // Set Element count (max value is 256 (2048 bit VL for pnd.b)) - const uint64_t elementCount = VL_bits / (sizeof(T) * 8); - out[0] |= (elementCount << bitsUsed); - return out; } diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index d7874a44a4..f6171fff6a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -327,19 +327,27 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } - // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, - // [xn{, - // // #imm, mul vl}] - // const uint16_t partition_num = VL_bits / 64; - - // const uint64_t base = sourceValues_[1].get(); - // const uint64_t offset = - // static_cast(metadata_.operands[5].mem.disp); - // const uint64_t addr = base + (offset * 4 * partition_num * 8); - - // setMemoryAddresses({addr, static_cast((VL_bits / 8) * 4)}); - // break; - // } + case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index c41c80b2b5..3b7bee1704 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2752,42 +2752,40 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } - // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, - // [xn{, - // // #imm, mul vl}] - // // LOAD - // const uint64_t* pn = sourceValues_[0].getAsVector(); - // const uint64_t* data = memoryData_[0].getAsVector(); - - // const uint16_t partition_num = VL_bits / 64; + case 
Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t* pn = sourceValues_[0].getAsVector(); - // // Get predicate-as-counter information - // const bool invert = - // (pn[0] & static_cast(0b1000000000000000)) != 0; - // const uint64_t numElems = - // (pn[0] & static_cast(0b0111111111110000)) >> 4; + // Get predicate-as-counter information + const bool invert = + (pn[0] & static_cast(0b1000000000000000)) != 0; + const uint64_t predElemCount = + (pn[0] & static_cast(0b0111111111110000)) >> 4; - // uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; - // uint64_t index = 0; + uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 64; - // for (int r = 0; r < 4; r++) { - // // If invert = 1, dictates number of inactive elements at start of - // // each - // // vector. Otherwise, it is number of active elements at start of - // each - // // vector. - // int iMax = (invert) ? partition_num : numElems; - // for (int i = (invert) ? numElems : 0; i < iMax; i++) { - // out[r][i] = data[index]; - // index++; - // } - // } - // results_[0] = {out[0], 256}; - // results_[1] = {out[1], 256}; - // results_[2] = {out[2], 256}; - // results_[3] = {out[3], 256}; - // break; - // } + for (int r = 0; r < 4; r++) { + for (int i = 0; i < partition_num; i++) { + // If invert = 1, predElemCount dictates number of initial inactive + // elements. + // Otherwise, it is number of initial active elements. + if ((r * partition_num) + i < predElemCount) { + out[r][i] = + (invert) ? 0 : memoryData_[r].getAsVector()[i]; + } else { + out[r][i] = + (invert) ? memoryData_[r].getAsVector()[i] : 0; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 21a51a5d1a..84d351c60e 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5063,6 +5063,7 @@ TEST_P(InstSve, ld1d_gather) { } TEST_P(InstSve, ld1d) { + // Single vector initialHeapData_.resize(VL / 4); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; @@ -5104,6 +5105,52 @@ TEST_P(InstSve, ld1d) { fillNeon({src[(base) % 4], src[(base + 1) % 4], src[(base + 2) % 4], src[(base + 3) % 4]}, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint64_t* heap64_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap64_multi, src_multi, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + + ptrue pn8.d + + ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] + )"); + base = (VL / 64) * 4; + uint16_t offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + 
(offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld1h) { @@ -5824,17 +5871,13 @@ TEST_P(InstSve, ptrue_counter) { ptrue pn11.h )"); const uint64_t ps = - 0b0000000000000000000000000000000000000000000000001000000000000100 | - ((static_cast(VL / 32)) << 3); + 0b0000000000000000000000000000000000000000000000001000000000000100; const uint64_t pd = - 0b0000000000000000000000000000000000000000000000001000000000001000 | - ((static_cast(VL / 64)) << 4); + 0b0000000000000000000000000000000000000000000000001000000000001000; const uint64_t pb = - 0b0000000000000000000000000000000000000000000000001000000000000001 | - ((static_cast(VL / 8)) << 1); + 0b0000000000000000000000000000000000000000000000001000000000000001; const uint64_t ph = - 0b0000000000000000000000000000000000000000000000001000000000000010 | - ((static_cast(VL / 16)) << 2); + 0b0000000000000000000000000000000000000000000000001000000000000010; CHECK_PREDICATE(8, uint64_t, {ps, 0x0, 0x0, 0x0}); CHECK_PREDICATE(9, uint64_t, {pd, 0x0, 0x0, 0x0}); CHECK_PREDICATE(10, uint64_t, {pb, 0x0, 0x0, 0x0}); From 487365779bf3584675eb94b3a7236df105f6d874 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 17:10:45 +0100 Subject: [PATCH 23/71] Implemented ld1d (2 consec vecs, uint64) SVE instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 19 ++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 32 +++++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 31 +++++++++++++++++++- 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index f6171fff6a..ecb5cab359 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -327,6 +327,25 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 3b7bee1704..bdcaeb1b1e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2752,6 +2752,38 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t* pn = sourceValues_[0].getAsVector(); + + // Get predicate-as-counter information + const bool invert = + (pn[0] & static_cast(0b1000000000000000)) != 0; + const uint64_t predElemCount = + (pn[0] & static_cast(0b0111111111110000)) >> 4; + + uint64_t out[2][32] = {{0}, {0}}; + const 
uint16_t partition_num = VL_bits / 64; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + // If invert = 1, predElemCount dictates number of initial inactive + // elements. + // Otherwise, it is number of initial active elements. + if ((r * partition_num) + i < predElemCount) { + out[r][i] = + (invert) ? 0 : memoryData_[r].getAsVector()[i]; + } else { + out[r][i] = + (invert) ? memoryData_[r].getAsVector()[i] : 0; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 84d351c60e..9933a5f653 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5112,6 +5112,35 @@ TEST_P(InstSve, ld1d) { std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; fillHeap(heap64_multi, src_multi, VL / 8); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + + ptrue pn8.d + + ld1d {z0.d, z1.d}, pn8/z, [x0, #2, mul vl] + )"); + base = (VL / 64) * 2; + uint16_t offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + // Four vector RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -5128,7 +5157,7 @@ TEST_P(InstSve, ld1d) { ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] )"); base = (VL / 64) * 4; - uint16_t offset = (VL / 64); + offset = (VL / 64); CHECK_NEON(0, uint64_t, fillNeon({src[(base) % 4], src[(base + 1) % 4], src[(base + 2) % 4], src[(base + 3) % 4]}, From b82ec90b1b6f355144d91bb79cf9617e2e2b9cd0 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 13:12:59 +0000 Subject: [PATCH 24/71] Implemented SME mova (tile to vec, 4 regs, 8-bit) instruction with tests. 
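A rough sketch of the row selection performed by the new execution case (illustrative only, not part of this patch; assumes a 512-bit VL): each destination vector receives one horizontal byte slice of ZA0, with the row index wrapping modulo the number of slices.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned VL_bits = 512;             // assumed VL
      const unsigned sliceCount = VL_bits / 8;  // rows in the .b tile
      const unsigned ws = 1;                    // slice-select register (w13 in the test)
      const unsigned offs1 = 0, offs4 = 3;      // slice offset range 0:3
      for (unsigned i = offs1; i <= offs4; i++) {
        // Destinations z8..z11 take row (ws + i) mod sliceCount of za0h.b
        std::printf("z%u <- za0h.b[row %u]\n", 8 + (i - offs1),
                    (ws + i) % sliceCount);
      }
      return 0;
    }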
--- src/lib/arch/aarch64/Instruction_execute.cc | 28 ++++++++++ test/regression/aarch64/instructions/sme.cc | 58 ++++++++++++++++++--- 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index bdcaeb1b1e..eeadf477eb 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3892,6 +3892,34 @@ void Instruction::execute() { results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_MOVA_4ZMXI_H_B: { // mova {zd1.b - zd4.b}, + // za0h.b[ws, offs1:offs4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t sliceCount = VL_bits / 8; + + const uint32_t ws = sourceValues_[sliceCount].get(); + const uint8_t offs1 = + metadata_.operands[4].sme.slice_offset.imm_range.first; + const uint8_t offs4 = + metadata_.operands[4].sme.slice_offset.imm_range.offset; + + uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; + + for (uint8_t i = offs1; i <= offs4; i++) { + // Get correct next row + const uint8_t* row = + sourceValues_[(ws + i) % sliceCount].getAsVector(); + // Update out and results_ + const uint8_t index = i - offs1; + memcpy(out[index], row, sliceCount); + results_[index] = {out[index], 256}; + } + break; + } case Opcode::AArch64_MOVID: { // movi dd, #imm results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 01def5ce7d..16716a5603 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -47,6 +47,47 @@ TEST_P(InstSme, mova) { CHECK_NEON(7, float, fillNeonCombined({4}, {10}, SVL / 8)); } +TEST_P(InstSme, mova_tilesToVecs) { + // uint8_t; 4 vectors + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + mov w12, #0 + ptrue p0.s + + # Pre-fill first 4 rows of za0.b + ld1w {za0h.s[w12, 0]}, p0/z, [x0] + ld1w {za1h.s[w12, 0]}, p0/z, [x0] + ld1w {za2h.s[w12, 0]}, p0/z, [x0] + ld1w {za3h.s[w12, 0]}, p0/z, [x0] + + + mova {z4.b-z7.b}, za0h.b[w12, 0:3] + + # Test Alias + mov w13, #1 + dup z11.b, #3 + mov {z8.b-z11.b}, za0h.b[w13, 0:3] + )"); + for (int i = 4; i <= 10; i++) { + CHECK_NEON( + i, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, + 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + SVL / 8)); + } + CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); +} + TEST_P(InstSme, fmopa) { // 32-bit RUN_AARCH64(R"( @@ -218,15 +259,16 @@ TEST_P(InstSme, ld1w) { whilelo p1.s, xzr, x1 ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon( + {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 3, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 1, uint64_t, - fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); - CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 3, uint64_t, - fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); - CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, - fillNeonCombined( - {0x12345678DEADBEEF, 
0xABCDEF0198765432}, {0}, SVL / 8)); + AARCH64_REG_ZAS1, 1, uint32_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); // Vertical initialHeapData_.resize(SVL / 4); From 89d7501d0ac41c1164040a5846d12990db0ead1e Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 16:02:42 +0000 Subject: [PATCH 25/71] Implemented pred-as-counter to pred_as_mask function, and added unit tests. --- .../simeng/arch/aarch64/Instruction.hh | 36 +++++++++++++++++++ test/unit/aarch64/InstructionTest.cc | 33 +++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index b5f1f07cc5..bee47e01bc 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -283,6 +283,42 @@ enum class InsnType : uint32_t { isBranch = 1 << 14 }; +/** Predefined shift values for converting pred-as-counter to pred-as-mask. */ +const uint64_t predCountShiftVals[9] = {0, 1, 2, 0, 3, 0, 0, 0, 4}; + +/** Convert Predicate-as-Mask to Predicate-as-Masks. + * T represents the element type (i.e. for pg.s, T = uint32_t). + * V represents the number of vectors the predicate-as-counter is being used + * for. */ +template +std::vector> predAsCounterToMasks( + const uint64_t predAsCounter, const uint16_t VL_bits) { + std::vector> out(V, {0, 0, 0, 0}); + + const uint16_t elemsPerVec = VL_bits / (sizeof(T) * 8); + // Get predicate-as-counter information + const bool invert = (predAsCounter & 0b1000000000000000) != 0; + const uint64_t predElemCount = + (predAsCounter & static_cast(0b0111111111111111)) >> + predCountShiftVals[sizeof(T)]; + + for (int r = 0; r < V; r++) { + for (int i = 0; i < elemsPerVec; i++) { + // Move bit to next position based on element type + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + // If invert = 1, predElemCount dictates number of initial inactive + // elements. + // If invert = 0, it is number of initial active elements. + if ((r * elemsPerVec) + i < predElemCount) { + out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; + } else { + out[r][i / (64 / sizeof(T))] |= (invert) ? shifted_active : 0; + } + } + } + return out; +} + /** A basic Armv9.2-a implementation of the `Instruction` interface. */ class Instruction : public simeng::Instruction { public: diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc index 1ecf14a1a6..80024bc1c6 100644 --- a/test/unit/aarch64/InstructionTest.cc +++ b/test/unit/aarch64/InstructionTest.cc @@ -627,6 +627,39 @@ TEST_F(AArch64InstructionTest, setters) { EXPECT_TRUE(insn.isWaitingCommit()); } +// Test predAsCounterToMasks function. 
+TEST_F(AArch64InstructionTest, predAsCounterToMasks_test) { + // 1.5 full vectors from start, VL = 128b, uint8_t elem size + std::vector> ref(2, {0, 0, 0, 0}); + ref[0][0] = + 0b0000000000000000000000000000000000000000000000001111111111111111; + ref[1][0] = + 0b0000000000000000000000000000000000000000000000000000000011111111; + // invert = 0, num active Elems = 24 + uint64_t pn = + 0b0000000000000000000000000000000000000000000000000000000000110001; + auto out = predAsCounterToMasks(pn, 128); + EXPECT_EQ(out[0][0], ref[0][0]); + EXPECT_EQ(out[1][0], ref[1][0]); + + // 0.5 of last vector, VL = 1024b, uint64_t elem size + std::vector> ref2(4, {0, 0, 0, 0}); + ref2[3][1] = + 0b0000000100000001000000010000000100000001000000010000000100000001; + // Invert = 1, num inactive Elems = 56 + uint64_t pn2 = + 0b0000000000000000000000000000000000000000000000001000001110001000; + auto out2 = predAsCounterToMasks(pn2, 1024); + EXPECT_EQ(out2[0][0], ref2[0][0]); + EXPECT_EQ(out2[0][1], ref2[0][1]); + EXPECT_EQ(out2[1][0], ref2[1][0]); + EXPECT_EQ(out2[1][1], ref2[1][1]); + EXPECT_EQ(out2[2][0], ref2[2][0]); + EXPECT_EQ(out2[2][1], ref2[2][1]); + EXPECT_EQ(out2[3][0], ref2[3][0]); + EXPECT_EQ(out2[3][1], ref2[3][1]); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file From ad5bd876c42b053794fcce9a346e5084ba95a875 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 17:06:52 +0000 Subject: [PATCH 26/71] Implemented st1d (2 consec vecs, uint64) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 21 ++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 56 ++++++++++----------- test/regression/aarch64/instructions/sve.cc | 35 +++++++++++++ 3 files changed, 82 insertions(+), 30 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index ecb5cab359..9881aeec1a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1002,6 +1002,27 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] const uint64_t* p = sourceValues_[2].getAsVector(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index eeadf477eb..460be3f700 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2755,28 +2755,18 @@ void Instruction::execute() { case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, // #imm, mul vl}] // LOAD - const uint64_t* pn = sourceValues_[0].getAsVector(); + const uint64_t pn = sourceValues_[0].get(); - // Get predicate-as-counter information - const bool 
invert = - (pn[0] & static_cast(0b1000000000000000)) != 0; - const uint64_t predElemCount = - (pn[0] & static_cast(0b0111111111110000)) >> 4; + auto preds = predAsCounterToMasks(pn, VL_bits); uint64_t out[2][32] = {{0}, {0}}; const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 2; r++) { for (int i = 0; i < partition_num; i++) { - // If invert = 1, predElemCount dictates number of initial inactive - // elements. - // Otherwise, it is number of initial active elements. - if ((r * partition_num) + i < predElemCount) { - out[r][i] = - (invert) ? 0 : memoryData_[r].getAsVector()[i]; - } else { - out[r][i] = - (invert) ? memoryData_[r].getAsVector()[i] : 0; + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; } } } @@ -2787,28 +2777,18 @@ void Instruction::execute() { case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] // LOAD - const uint64_t* pn = sourceValues_[0].getAsVector(); + const uint64_t pn = sourceValues_[0].get(); - // Get predicate-as-counter information - const bool invert = - (pn[0] & static_cast(0b1000000000000000)) != 0; - const uint64_t predElemCount = - (pn[0] & static_cast(0b0111111111110000)) >> 4; + auto preds = predAsCounterToMasks(pn, VL_bits); uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 4; r++) { for (int i = 0; i < partition_num; i++) { - // If invert = 1, predElemCount dictates number of initial inactive - // elements. - // Otherwise, it is number of initial active elements. - if ((r * partition_num) + i < predElemCount) { - out[r][i] = - (invert) ? 0 : memoryData_[r].getAsVector()[i]; - } else { - out[r][i] = - (invert) ? memoryData_[r].getAsVector()[i] : 0; + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; } } } @@ -4800,6 +4780,22 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn|sp] // STORE diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 9933a5f653..8e26434bd4 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6638,6 +6638,41 @@ TEST_P(InstSve, st1d) { } } +TEST_P(InstSve, st1d_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap64, src, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + st1d {z0.d, z1.d}, pn8, [sp] + st1d {z0.d, z1.d}, pn8, [x4, #2, mul vl] + 
)"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + } + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(65792 + (2 * (VL / 8)) + (i * 8)), + src[i % 4]); + } +} + TEST_P(InstSve, st2d) { // 32-bit RUN_AARCH64(R"( From 9bf115a2beb5d3093b773af9cdd8ca7147ae2700 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 17:18:17 +0000 Subject: [PATCH 27/71] Implemented st1d (2 consec vecs, uint64, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 20 ++++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 10 +++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 9881aeec1a..1087668f53 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1002,6 +1002,26 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_2Z: { // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, // #imm, mul vl}] const uint64_t pn = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 460be3f700..e6c070d3b6 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4780,6 +4780,10 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1D_2Z: // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + // STORE + [[fallthrough]]; case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, // #imm, mul vl}] // STORE diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 8e26434bd4..c52fd16396 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6651,6 +6651,7 @@ TEST_P(InstSve, st1d_multivec) { svc #0 sub sp, sp, #4095 + mov x1, #1 mov x4, #256 madd x4, x4, x4, x4 ptrue p0.d @@ -6658,18 +6659,17 @@ TEST_P(InstSve, st1d_multivec) { ld1d {z0.d}, p0/z, [x0] ld1d {z1.d}, p0/z, [x0, #1, mul vl] st1d {z0.d, z1.d}, pn8, [sp] - st1d {z0.d, z1.d}, pn8, [x4, #2, mul vl] + st1d {z0.d, z1.d}, pn8, [x4, #4, mul vl] + st1d {z0.d, z1.d}, pn8, [x4, x1, lsl #3] )"); for (uint64_t i = 0; i < (VL / 32); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 8)), src[i % 4]); - } - - for (uint64_t i = 0; i < (VL / 32); i++) { - EXPECT_EQ(getMemoryValue(65792 + (2 * (VL / 8)) + (i * 8)), + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 8)), src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 
+ 8 + (i * 8)), src[i % 4]); } } From ff8bb58094b9c266151c7da45fd3aa77e50a60b2 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 12:10:37 +0000 Subject: [PATCH 28/71] Implemented LD1W (2 vec and 4 vec, imm offset) SVE2 instructions with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 42 ++++++++++- src/lib/arch/aarch64/Instruction_execute.cc | 46 ++++++++++++ test/regression/aarch64/instructions/sve.cc | 77 ++++++++++++++++++++- 3 files changed, 163 insertions(+), 2 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 1087668f53..1db4d54f4f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -333,7 +333,7 @@ span Instruction::generateAddresses() { const uint64_t base = sourceValues_[1].get(); const uint64_t offset = - static_cast(metadata_.operands[5].mem.disp); + static_cast(metadata_.operands[3].mem.disp); const uint64_t addr = base + (offset * partition_num * 8); std::vector addresses; @@ -407,6 +407,46 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD2D: { // ld2d {zt1.d, zt2.d}, pg/z, [xn|sp, xm, // lsl #3] const uint64_t base = sourceValues_[1].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e6c070d3b6..13bed72f1b 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3251,6 +3251,52 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[2][64] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, 
png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 4; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] // LOAD const int index = metadata_.operands[0].vector_index; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c52fd16396..46cd7bdac3 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5221,11 +5221,11 @@ TEST_P(InstSve, ld1h) { } TEST_P(InstSve, ld1w) { + // Single vector initialHeapData_.resize(VL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; fillHeap(heap32, src, VL / 16); - RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -5260,6 +5260,81 @@ TEST_P(InstSve, ld1w) { CHECK_NEON(3, uint64_t, fillNeonCombined( {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, VL / 8)); + + // Multi vector + initialHeapData_.resize(VL); + uint32_t* heap32_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_multi, src_multi, VL / 4); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + + ptrue pn8.s + + ld1w {z0.s, z1.s}, pn8/z, [x0, #2, mul vl] + )"); + uint16_t base = (VL / 32) * 2; + uint16_t offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + + ptrue pn8.s + + ld1w {z0.s - z3.s}, pn8/z, [x0, #4, mul vl] + )"); + base = (VL / 32) * 4; + offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld2d) { From c40e9f4272797f724203dc7f0bee92f3640916d6 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 12:18:48 +0000 Subject: [PATCH 29/71] Implemented LD1W (2 vec, scalar offset) SVE2 instruction with tests. 
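Worked example of the scalar-offset addressing (illustrative only, not part of this patch; assumes a 256-bit VL and an example base address): the base is offset by xm scaled to bytes, and the second destination vector streams from one vector-length further on.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t VL_bits = 256;                  // assumed VL
      const uint64_t base = 0x1000;                  // example heap address in xn
      const uint64_t xm = 2;                         // as in the new test (mov x1, #2)
      const uint64_t addr0 = base + (xm << 2);       // first vector: base + 8 bytes
      const uint64_t addr1 = addr0 + (VL_bits / 8);  // second vector follows on
      assert(addr0 == 0x1008);
      assert(addr1 == 0x1028);
      return 0;
    }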
--- src/lib/arch/aarch64/Instruction_address.cc | 16 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 1db4d54f4f..c43e8ec7b2 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -407,6 +407,22 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1W_2Z: { // ld1w {zt1.s, zt2.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 32; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 13bed72f1b..e977e3e0dd 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3251,6 +3251,10 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1W_2Z: // ld1w {zt1.s, zt2.s}, png/z, [xn, xm, + // lsl #2] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 46cd7bdac3..96a168a5c2 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5277,10 +5277,14 @@ TEST_P(InstSve, ld1w) { dup z0.s, #1 dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 ptrue pn8.s + mov x1, #2 ld1w {z0.s, z1.s}, pn8/z, [x0, #2, mul vl] + ld1w {z2.s, z3.s}, pn8/z, [x0, x1, lsl #2] )"); uint16_t base = (VL / 32) * 2; uint16_t offset = (VL / 32); @@ -5295,6 +5299,13 @@ TEST_P(InstSve, ld1w) { src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, VL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon({src[2], src[3], src[0], src[1]}, VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[(2 + offset) % 4], src[(3 + offset) % 4], + src[(0 + offset) % 4], src[(1 + offset) % 4]}, + VL / 8)); + // Four vector RUN_AARCH64(R"( # Get heap address From 5f4fd1c4bea34362b91b539ce33affeda09472d4 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 12:35:34 +0000 Subject: [PATCH 30/71] Implemented ST1W (2 vec, imm and scalar offset) SVE2 instructions with tests. 
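Worked example of the immediate-offset form used in the new test (illustrative only, not part of this patch; assumes a 256-bit VL): `mov x4, #256; madd x4, x4, x4, x4` gives a base of 256*256 + 256 = 65792, and `#4, mul vl` advances the start address by four vector-lengths, matching the addresses the test reads back.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t VL_bits = 256;               // assumed VL
      const uint64_t elemsPerVec = VL_bits / 32;  // .s elements per vector
      const uint64_t base = 256 * 256 + 256;      // madd x4, x4, x4, x4 -> 65792
      const int64_t imm = 4;                      // #4, mul vl
      const uint64_t addr = base + imm * elemsPerVec * 4;  // base + 4 * (VL/8) bytes
      assert(base == 65792);
      assert(addr == 65792 + 4 * (VL_bits / 8));
      return 0;
    }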
--- src/lib/arch/aarch64/Instruction_address.cc | 41 +++++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 20 ++++++++++ test/regression/aarch64/instructions/sve.cc | 35 ++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index c43e8ec7b2..aeec71008f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1208,6 +1208,47 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1W_2Z: { // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e977e3e0dd..87f52e089e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5054,6 +5054,26 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1W_2Z: // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + // STORE + [[fallthrough]]; + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE const uint16_t* t = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 96a168a5c2..8410f6d724 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6977,6 +6977,41 @@ 
TEST_P(InstSve, st1w) { } } +TEST_P(InstSve, st1w_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, VL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + st1w {z0.s, z1.s}, pn8, [sp] + st1w {z0.s, z1.s}, pn8, [x4, #4, mul vl] + st1w {z0.s, z1.s}, pn8, [x4, x1, lsl #2] + )"); + + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 4)), src[i % 4]); + } +} + TEST_P(InstSve, str_predicate) { initialHeapData_.resize(VL / 64); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 7a717e189e7d49342945c7292e1c137454bf3c44 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 15:27:49 +0000 Subject: [PATCH 31/71] Implemented LD1B (2 vec, imm and scalar offset) SVE2 instructions with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 34 ++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 25 ++++++ test/regression/aarch64/instructions/sve.cc | 89 +++++++++++++++++++++ 3 files changed, 148 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index aeec71008f..a0ed89fcd4 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -319,6 +319,40 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1B_2Z: { // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 87f52e089e..b5496371fc 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2733,6 +2733,31 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1B_2Z: // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, 
png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[2][256] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 8410f6d724..00ae0efe56 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4893,6 +4893,7 @@ TEST_P(InstSve, ld1rw) { } TEST_P(InstSve, ld1b) { + // Single vector initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, @@ -4930,6 +4931,94 @@ TEST_P(InstSve, ld1b) { VL / 16)); std::rotate(src.begin(), src.begin() + ((VL / 8) % 16), src.end()); CHECK_NEON(2, uint8_t, fillNeon(src, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint8_t* heap8_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, + 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, + 0x01, 0xEF, 0xCD, 0xAB}; + ; + fillHeap(heap8_multi, src_multi, VL); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + mov x1, #2 + + ld1b {z0.b, z1.b}, pn8/z, [x0, #2, mul vl] + ld1b {z2.b, z3.b}, pn8/z, [x0, x1] + )"); + uint16_t base = (VL / 8) * 2; + uint16_t offset = (VL / 8); + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({src[2], src[3], src[4], src[5], src[6], src[7], + src[8], src[9], src[10], src[11], src[12], + src[13], src[14], src[15], src[0], src[1]}, + VL / 8)); + CHECK_NEON( + 3, uint8_t, + fillNeon({src[(2 + offset) % 16], src[(3 + offset) % 16], + src[(4 + offset) % 16], src[(5 + offset) % 16], + src[(6 + offset) % 16], src[(7 + offset) % 16], + src[(8 + offset) % 16], src[(9 + offset) % 16], + src[(10 + offset) % 16], src[(11 + offset) % 16], + src[(12 + offset) % 16], src[(13 + offset) % 16], + src[(14 + 
offset) % 16], src[(15 + offset) % 16], + src[(0 + offset) % 16], src[(1 + offset) % 16]}, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { From 6dca41088b31f43f326aaf8052baeb67e915d159 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 15:41:58 +0000 Subject: [PATCH 32/71] Implemented UMPOA (8-bit to 32-bit widening uint) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 38 +++++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 32 +++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index b5496371fc..41fd194c31 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5902,6 +5902,44 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_UMOPA_MPPZZ_S: { // umopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 16716a5603..f3b83d510d 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -575,6 +575,38 @@ TEST_P(InstSme, st1w) { } } +TEST_P(InstSme, umopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({112}, (SVL / 16))); + } +} + TEST_P(InstSme, zero) { // ZT0 RUN_AARCH64(R"( From 8b1f9e711c3c1a218cc8411504318bcf516f8f3d Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 16:05:02 +0000 Subject: [PATCH 33/71] Implemented LD1B (4 vec, imm offset) SVE2 instruction with tests. 
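The scalar-plus-immediate form resolves to four consecutive vector-sized blocks; destination register k receives block k, zero-masked by quarter k of the PNg counter predicate. A rough sketch of the layout, with VL_bits standing in for the configured vector length (illustrative only):

    addr = xn + imm * (VL_bits / 8)            (imm counts whole vectors of byte elements)
    zt1 <- bytes [addr,                  addr +     VL_bits/8)
    zt2 <- bytes [addr +     VL_bits/8,  addr + 2 * VL_bits/8)
    zt3 <- bytes [addr + 2 * VL_bits/8,  addr + 3 * VL_bits/8)
    zt4 <- bytes [addr + 3 * VL_bits/8,  addr + 4 * VL_bits/8)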
--- src/lib/arch/aarch64/Instruction_address.cc | 21 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 +++++ test/regression/aarch64/instructions/sve.cc | 104 +++++++++++++++++++- 3 files changed, 148 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index a0ed89fcd4..10c391e33d 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -353,6 +353,27 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 41fd194c31..dc9bce38c3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2758,6 +2758,30 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[4][256] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 4; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 00ae0efe56..48f67a2b1f 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4938,7 +4938,6 @@ TEST_P(InstSve, ld1b) { std::vector src_multi = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}; - ; fillHeap(heap8_multi, src_multi, VL); // Two vector @@ -5019,6 +5018,109 @@ TEST_P(InstSve, ld1b) { src[(14 + offset) % 16], src[(15 + offset) % 16], src[(0 + offset) % 16], src[(1 + offset) % 16]}, VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + + ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + )"); + base = (VL / 8) * 4; + offset = (VL / 8); + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) 
% 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { From 5325d3f26ce0e12d30bec4170911dfe2dc727909 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 12:28:34 +0000 Subject: [PATCH 34/71] Implemented UDOT (4-way, VGx4 8-bit to 32-bit widening, indexed vector) SME instruction with tests. 
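The expected values in the new regression test can be reproduced by hand. Each umopa pre-fill accumulates four 8*3 products into every 32-bit ZA element, giving 96 everywhere; z10 holds the repeating byte pattern 0..15, so zm index 2 selects bytes {8, 9, 10, 11} from every 128-bit segment. With zn1 to zn4 broadcast to 10, 11, 12 and 13, each element of the single updated row in every ZA quarter becomes (a quick sanity check, not an exhaustive derivation):

    quarter 0 (zn1 = 10): 96 + 10*(8 + 9 + 10 + 11) = 96 + 10*38 = 476
    quarter 1 (zn2 = 11): 96 + 11*38 = 514
    quarter 2 (zn3 = 12): 96 + 12*38 = 552
    quarter 3 (zn4 = 13): 96 + 13*38 = 590

All other ZA rows are untouched and remain 96.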
--- src/lib/arch/aarch64/Instruction_execute.cc | 71 +++++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 61 ++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index dc9bce38c3..8a2dd74612 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5875,6 +5875,77 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_VG4_M4ZZI_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction destructively adds the widened dot product + // (4x 8-bit --> 1x 32-bit) of the following to each 32-bit element + // in the current `zaRow`: + // - four 8-bit values in each corresponding 32-bit element of + // the current source `znr` vector + // - four 8-bit values from a 32-bit element of `zm`, selected + // from each 128-bit segment of `zm` using an index + // + // The 128-bit segment of `zm` currently in use corresponds to the + // 128-bit segment that the current 32-bit elements of `znr` + // and `zaRow` are within. 
+ // For example, with a SVL = 512-bits, elements `e` of `zaRow` in + // the range 0->15, and zmIndex = 1: + // - When `e` = 0 -> 3, the 32-bit element used from `zm` will be + // zm[1] (1st 32-bit element in 0th 128-bit + // segment) + // - When `e` = 4 -> 7, the 32-bit element used from `zm` will be + // zm[5] (1st 32-bit element in 1st 128-bit + // segment) + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] results_[0] = sveUdot_indexed(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index f3b83d510d..0237a02840 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -575,6 +575,67 @@ TEST_P(InstSme, st1w) { } } +TEST_P(InstSme, udot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({476}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({514}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({552}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({590}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, umopa) { // 32-bit RUN_AARCH64(R"( From 7125a40066f94ed718cf94e058d54896ecceaa59 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 13:03:21 +0000 Subject: [PATCH 35/71] Implemented MOVA (array to vecs, 4 registers) SME instruction with tests. 
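The mova_zaToVecs test re-uses the umopa/udot sequence from the udot_vgx4 test, then reads the ZA single-vector group back with a .d element size, so each uint64_t lane packs two adjacent, equal 32-bit ZA elements (i.e. value * (2^32 + 1)). The expected constants follow directly:

    96  ->  96 * 2^32 +  96 = 0x0000006000000060 =  412316860512
    476 -> 476 * 2^32 + 476 = 0x000001DC000001DC = 2044404433372
    514 -> 514 * 2^32 + 514                      = 2207613190658
    552 -> 552 * 2^32 + 552                      = 2370821947944
    590 -> 590 * 2^32 + 590                      = 2534030705230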
--- src/lib/arch/aarch64/Instruction_execute.cc | 29 ++++++++--- test/regression/aarch64/instructions/sme.cc | 56 ++++++++++++++++++++- 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8a2dd74612..d95696c840 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3986,19 +3986,32 @@ void Instruction::execute() { const uint8_t offs4 = metadata_.operands[4].sme.slice_offset.imm_range.offset; - uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; - for (uint8_t i = offs1; i <= offs4; i++) { - // Get correct next row - const uint8_t* row = - sourceValues_[(ws + i) % sliceCount].getAsVector(); - // Update out and results_ const uint8_t index = i - offs1; - memcpy(out[index], row, sliceCount); - results_[index] = {out[index], 256}; + results_[index] = sourceValues_[(ws + i) % sliceCount]; } break; } + case Opcode::AArch64_MOVA_VG4_4ZMXI: { // mova {zd1.d - zd4.d}, za.d[wv, + // offs, vgx4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[4].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + results_[2] = sourceValues_[(2 * zaStride) + zaIndex]; + results_[3] = sourceValues_[(3 * zaStride) + zaIndex]; + break; + } case Opcode::AArch64_MOVID: { // movi dd, #imm results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 0237a02840..f7a3689e62 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,7 +7,7 @@ namespace { using InstSme = AArch64RegressionTest; -TEST_P(InstSme, mova) { +TEST_P(InstSme, mova_tileToVec) { // 8-bit RUN_AARCH64(R"( smstart @@ -47,6 +47,60 @@ TEST_P(InstSme, mova) { CHECK_NEON(7, float, fillNeonCombined({4}, {10}, SVL / 8)); } +TEST_P(InstSme, mova_zaToVecs) { + // 4 vectors + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + + mov w9, #0 + mova {z20.d - z23.d}, za.d[w9, #0, vgx4] + mov {z24.d - z27.d}, za.d[w8, #1, vgx4] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(22, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(23, uint64_t, 
fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); +} + TEST_P(InstSme, mova_tilesToVecs) { // uint8_t; 4 vectors initialHeapData_.resize(SVL / 4); From c6da568feae16035980f52d6f43816e08ccc8ade Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 13:35:23 +0000 Subject: [PATCH 36/71] Implemented ST1W (4 vec, imm offset) SVE2 instructions with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 27 +++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 ++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 32 ++++++++++++++++++++- 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 10c391e33d..bd78d6649c 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1304,6 +1304,33 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1W_4Z_IMM: { // st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 4, 4, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 4, 4, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d95696c840..1f471da2d2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5136,6 +5136,30 @@ void Instruction::execute() { memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); break; } + case Opcode::AArch64_ST1W_4Z_IMM: { // st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint32_t* t3 = sourceValues_[2].getAsVector(); + const uint32_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + 
memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE const uint16_t* t = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 48f67a2b1f..1d3925dcd8 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7192,7 +7192,6 @@ TEST_P(InstSve, st1w_multivec) { st1w {z0.s, z1.s}, pn8, [x4, #4, mul vl] st1w {z0.s, z1.s}, pn8, [x4, x1, lsl #2] )"); - for (uint64_t i = 0; i < (VL / 16); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 4)), @@ -7201,6 +7200,37 @@ TEST_P(InstSve, st1w_multivec) { src[i % 4]); EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 4)), src[i % 4]); } + + // Four vectors + initialHeapData_.resize(VL); + heap32 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap32, src, VL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + ld1w {z2.s}, p0/z, [x0, #2, mul vl] + ld1w {z3.s}, p0/z, [x0, #3, mul vl] + st1w {z0.s - z3.s}, pn8, [sp] + st1w {z0.s - z3.s}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 8); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 4)), + src[i % 4]); + } } TEST_P(InstSve, str_predicate) { From 7e2f9a462b72e51373f429c8166cca2e7a51485d Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 16:55:44 +0000 Subject: [PATCH 37/71] Fixed SVE udot execution logic. 
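sveUdot_indexed previously applied the immediate index across the whole zm vector; the indexed UDOT semantics instead select one D-type element from each 128-bit segment of zm and re-use it for every destination element that falls inside that segment. For the 8-bit to 32-bit form the corrected indexing is, roughly:

    base   = i - (i % 4)                 (4 x 32-bit elements per 128-bit segment)
    out[i] = zd[i] + sum over j = 0..3 of  zn_bytes[4*i + j] * zm_bytes[4*(base + index) + j]

The regression test now fills z0 with ld1rqb rather than ldr q0, replicating the 16-byte source pattern into every 128-bit segment so the expected values stay independent of the vector length.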
--- src/include/simeng/arch/aarch64/helpers/sve.hh | 6 +++++- test/regression/aarch64/instructions/sve.cc | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index df924c1f8c..c963b22f7a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1640,9 +1640,13 @@ RegisterValue sveUdot_indexed( D out[256 / sizeof(D)] = {0}; for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { D acc = zd[i]; + // Index into zm selects which D-type element within each 128-bit vector + // segment to use + int base = i - (i % (128 / (sizeof(D) * 8))); + int zmIndex = base + index; for (int j = 0; j < W; j++) { acc += (static_cast(zn[(W * i) + j]) * - static_cast(zm[(W * index) + j])); + static_cast(zm[(W * zmIndex) + j])); } out[i] = acc; } diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 1d3925dcd8..bdf4320658 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7662,7 +7662,8 @@ TEST_P(InstSve, udot) { mov x8, #214 svc #0 - ldr q0, [x0] + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] dup z2.b, #2 dup z3.b, #3 @@ -7672,6 +7673,7 @@ TEST_P(InstSve, udot) { udot z4.s, z2.b, z0.b[0] udot z5.s, z3.b, z0.b[3] )"); + CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); } From 6772b66e4b57d434007d0b23f8b042d6f0988d95 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 17:38:57 +0000 Subject: [PATCH 38/71] Fixed issue with LD1B SVE2 (4 vec) instruction. --- src/lib/arch/aarch64/Instruction_address.cc | 2 +- src/lib/arch/aarch64/Instruction_execute.cc | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index bd78d6649c..41b03e3216 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -359,7 +359,7 @@ span Instruction::generateAddresses() { const uint64_t base = sourceValues_[1].get(); const uint64_t offset = - static_cast(metadata_.operands[3].mem.disp); + static_cast(metadata_.operands[5].mem.disp); const uint64_t addr = base + (offset * partition_num); std::vector addresses; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1f471da2d2..efe8a208c6 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2765,14 +2765,15 @@ void Instruction::execute() { auto preds = predAsCounterToMasks(pn, VL_bits); - uint8_t out[4][256] = {{0}, {0}}; + uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; const uint16_t partition_num = VL_bits / 8; for (int r = 0; r < 4; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << (i % 64); if (preds[r][i / 64] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } From ab80ba7c514b428f9ba365af2d3b4dd447bc1959 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 31 Oct 2024 15:53:52 +0000 Subject: [PATCH 39/71] Implemented FMLA (float, double, VGx4, indexed) SME instruction with tests. 
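The regression test values can be checked by hand: fmopa with both predicates true fills every ZA element with 3.0 * 8.0 = 24.0, and z10 holds the repeating float pattern {0, 1, 2, 3}, so zm index 2 picks 2.0 out of every 128-bit segment. Each element of the four updated rows is then a single multiply-add (sketch):

    quarter 0 (zn1 =  0.25): 24.0 + ( 0.25 * 2.0) = 24.5
    quarter 1 (zn2 =  1.5 ): 24.0 + ( 1.5  * 2.0) = 27.0
    quarter 2 (zn3 = -0.5 ): 24.0 + (-0.5  * 2.0) = 23.0
    quarter 3 (zn4 = -2.5 ): 24.0 + (-2.5  * 2.0) = 19.0

The double-precision variant loads the pattern {2.0, 3.0} and uses index 0, which selects the same 2.0 multiplier, so the expected rows are identical.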
--- src/lib/arch/aarch64/Instruction_execute.cc | 106 +++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 122 +++++++++++++++++++- 2 files changed, 227 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index efe8a208c6..3ee9dfeec1 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1824,6 +1824,112 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4ZZI_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, zm.d[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const double* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. 
+ + // MOD 2 as there are 2 64-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 2); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4ZZI_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, zm.s[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const float* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. 
+ + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FMLA_ZPmZZ_D: { // fmla zd.d, pg/m, zn.d, zm.d results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index f7a3689e62..25e2b4c800 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -142,6 +142,126 @@ TEST_P(InstSme, mova_tilesToVecs) { CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); } +TEST_P(InstSme, fmla_indexed_vgx4) { + // float + initialHeapData_.resize(SVL); + float* heapf = reinterpret_cast(initialHeapData_.data()); + std::vector srcf = {0.0f, 1.0f, 2.0f, 3.0f}; + fillHeap(heapf, srcf, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #0.25 + fdup z5.s, #1.5 + fdup z6.s, #-0.5 + fdup z7.s, #-2.5 + ld1w {z10.s}, p0/z, [x0] + + fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, z10.s[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.5f}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({23.0f}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({19.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // double + initialHeapData_.resize(SVL); + double* heapd = reinterpret_cast(initialHeapData_.data()); + std::vector srcd = {2.0f, 3.0f}; + fillHeap(heapd, srcd, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + ld1d {z10.d}, p0/z, [x0] + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, z10.d[0] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.5}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({27.0}, (SVL / 8))); + } else if 
(i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({23.0}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({19.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} + TEST_P(InstSme, fmopa) { // 32-bit RUN_AARCH64(R"( @@ -629,7 +749,7 @@ TEST_P(InstSme, st1w) { } } -TEST_P(InstSme, udot_vgx4) { +TEST_P(InstSme, udot_Indexed_vgx4) { // 8-bit to 32-bit widening initialHeapData_.resize(SVL / 8); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 9e762b88f0c10c03f6011cd4591338f1c369b80b Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 31 Oct 2024 16:43:55 +0000 Subject: [PATCH 40/71] Implemented st1d (4 consec vecs, uint64, imm offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 27 ++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 ++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 31 +++++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 41b03e3216..ad61720c6a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1154,6 +1154,33 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 8, 8, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 8, 8, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] const uint64_t* p = sourceValues_[2].getAsVector(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 3ee9dfeec1..8d609ba38c 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5019,6 +5019,30 @@ void Instruction::execute() { memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); break; } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t* t3 = sourceValues_[2].getAsVector(); + const uint64_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), 
VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn|sp] // STORE diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index bdf4320658..0e8344b3a0 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6948,6 +6948,37 @@ TEST_P(InstSve, st1d_multivec) { src[i % 4]); EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 8)), src[i % 4]); } + + // Four vectors + initialHeapData_.resize(VL); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + ld1d {z2.d}, p0/z, [x0, #2, mul vl] + ld1d {z3.d}, p0/z, [x0, #3, mul vl] + st1d {z0.d - z3.d}, pn8, [sp] + st1d {z0.d - z3.d}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 8)), + src[i % 4]); + } } TEST_P(InstSve, st2d) { From 7de00825c4e79a4e595dfae37a9e0e5d3eafeb12 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 31 Oct 2024 17:44:36 +0000 Subject: [PATCH 41/71] Added NEON bf16 UDOT (by element) instruction execution logic and BF16 build option. --- CMakeLists.txt | 1 + src/include/simeng/version.hh.in | 1 + src/lib/arch/aarch64/Instruction_execute.cc | 23 +++++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d0691578fe..42111288ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ option(SIMENG_SANITIZE "Enable compiler sanitizers" OFF) option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizations" OFF) option(SIMENG_ENABLE_SST "Compile SimEng SST Wrapper" OFF) option(SIMENG_ENABLE_SST_TESTS "Enable testing for SST" OFF) +option(SIMENG_ENABLE_BF16 "Enable __bf16 instruction execution logic" OFF) # Set CXX flag for Apple Mac so that `binary_function` and `unary_function` types that are used in SST can be recognised. 
# They were deprecated in C++11 and removed in C++17, and Apple Clang v15 no longer supports these types without the following flag diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in index 5f1e8f410b..8a2a823a66 100644 --- a/src/include/simeng/version.hh.in +++ b/src/include/simeng/version.hh.in @@ -9,5 +9,6 @@ #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@ #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}" #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}" +#define SIMENG_ENABLE_BF16 "${SIMENG_ENABLE_BF16}" #endif \ No newline at end of file diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8d609ba38c..583818fc39 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -486,6 +486,29 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } + case Opcode::AArch64_BF16DOTlanev8bf16: { // bfdot vd.4s, vn.8h, + // vm.2h[index] + // BF16 -- EXPERIMENTAL + if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const float* vd = sourceValues_[0].getAsVector(); + const __bf16* vn = sourceValues_[1].getAsVector<__bf16>(); + const __bf16* vm = sourceValues_[2].getAsVector<__bf16>(); + const int vmIndex = metadata_.operands[2].vector_index; + + float out[4] = {vd[0], vd[1], vd[2], vd[3]}; + for (int i = 0; i < 4; i++) { + out[i] += (static_cast(vn[2 * i]) * + static_cast(vm[2 * vmIndex])) + + (static_cast(vn[2 * i + 1]) * + static_cast(vm[2 * vmIndex + 1])); + } + results_[0] = RegisterValue(out, 256); + break; + } case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; From 14a79d8192036439ccacccd2e039af57b829eecc Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 14:33:56 +0000 Subject: [PATCH 42/71] Implemented ld1b (4 strided vectors, imm and reg offset) instructions with tests. 
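The strided forms share their memory layout with the consecutive-register LD1B (4 vec) variants: four vector-sized blocks starting at xn + xm (register offset) or at xn plus #imm whole vectors (immediate offset). The execute logic therefore falls through to the existing LD1B_4Z_IMM handler; only the destination register numbering differs, using a register stride of four (e.g. {z16.b, z20.b, z24.b, z28.b} in the tests). A rough sketch of which block lands where (illustrative only):

    addr(register form)  = xn + xm
    addr(immediate form) = xn + imm * (VL_bits / 8)
    block k = bytes [addr + k*(VL_bits/8), addr + (k+1)*(VL_bits/8))   for k = 0..3
    block k -> k-th register of the strided group, masked by quarter k of the PNg counter predicate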
--- src/lib/arch/aarch64/Instruction_address.cc | 22 +++ src/lib/arch/aarch64/Instruction_execute.cc | 9 + test/regression/aarch64/instructions/sve.cc | 174 ++++++++++++++++++++ 3 files changed, 205 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index ad61720c6a..d9180f189a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -353,6 +353,28 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1B_4Z_STRIDED: { // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + [[fallthrough]]; case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 583818fc39..a433e409c8 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2887,6 +2887,15 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1B_4Z_STRIDED: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 0e8344b3a0..60099cbb5d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5033,10 +5033,14 @@ TEST_P(InstSve, ld1b) { ptrue pn8.b + mov x1, #4 ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + ld1b {z16.b, z20.b, z24.b, z28.b}, pn8/z, [x0, #4, mul vl] + ld1b {z17.b, z21.b, z25.b, z29.b}, pn8/z, [x0, x1] )"); base = (VL / 8) * 4; offset = (VL / 8); + // Consecutive vectors CHECK_NEON(0, uint8_t, fillNeon( { @@ -5121,6 +5125,176 @@ TEST_P(InstSve, ld1b) { src[((base + (3 * offset)) + 15) % 16], }, VL / 8)); + // Strided (4-stride) vectors + CHECK_NEON(16, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(20, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + 
src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(24, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(28, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + base = (VL / 8) + 4; + CHECK_NEON(17, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(21, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(25, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + 
src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(29, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { From 2db03bcfdef99a322f9794a666b780865f45f655 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 15:06:17 +0000 Subject: [PATCH 43/71] Implemented UVDOT (VGx4 8-bit to 32-bit widening, indexed vector) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 52 ++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 61 +++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a433e409c8..c48886aa93 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6146,6 +6146,58 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_UVDOT_VG4_M4ZZI_BtoS: { // uvdot za.s[wv, #off, + // vgx4], {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+ const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + i].getAsVector(); + out[e] += static_cast(znr[4 * e + r]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] results_[0] = sveUdot_indexed(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 25e2b4c800..4ea8c58a74 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -810,6 +810,67 @@ TEST_P(InstSme, udot_Indexed_vgx4) { } } +TEST_P(InstSme, uvdot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + uvdot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, umopa) { // 32-bit RUN_AARCH64(R"( From 68038b713a8f393cd4ef1570129f241cf2732da5 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 18:11:03 +0000 Subject: [PATCH 44/71] Implemented ST4W (imm offset) SVE instruction with tests. 
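ST4W stores its four source vectors element-interleaved rather than back to back. A minimal standalone sketch of the memory layout the new execute logic produces (the helper name and use of std::vector are illustrative only, not part of the patch):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // For each active lane k, ST4W writes zt1[k], zt2[k], zt3[k], zt4[k] to
    // four consecutive 32-bit words before moving on to lane k + 1.
    std::vector<uint32_t> st4wLayout(const std::vector<uint32_t>& zt1,
                                     const std::vector<uint32_t>& zt2,
                                     const std::vector<uint32_t>& zt3,
                                     const std::vector<uint32_t>& zt4) {
      std::vector<uint32_t> mem;
      for (std::size_t k = 0; k < zt1.size(); k++) {
        mem.push_back(zt1[k]);
        mem.push_back(zt2[k]);
        mem.push_back(zt3[k]);
        mem.push_back(zt4[k]);
      }
      return mem;
    }

With the regression test's inputs (z0-z3 duplicated to 3, 4, 5 and 6), memory should therefore read 3, 4, 5, 6, 3, 4, 5, 6, ... which is what the EXPECT_EQ checks assert.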
--- src/lib/arch/aarch64/Instruction_address.cc | 20 +++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 40 ++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 47 ++++++++++++++++++++- 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index d9180f189a..88e3c1bde4 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1222,6 +1222,26 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + + std::vector addresses; + addresses.reserve(partition_num * 4); + + uint64_t addr = base + (offset * partition_num * 4); + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 4, p, + addresses); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, // [{, xm, lsl #3}] case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index c48886aa93..4b90ef5d55 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5448,6 +5448,46 @@ void Instruction::execute() { results_[0] = sourceValues_[2].get() + postIndex; break; } + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + // STORE + const uint32_t* d1 = sourceValues_[0].getAsVector(); + const uint32_t* d2 = sourceValues_[1].getAsVector(); + const uint32_t* d3 = sourceValues_[2].getAsVector(); + const uint32_t* d4 = sourceValues_[3].getAsVector(); + const uint64_t* p = sourceValues_[4].getAsVector(); + + std::vector memData; + bool inActiveBlock = false; + + const uint16_t partition_num = VL_bits / 32; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + // If active and not in active block, initialise + if (!inActiveBlock) { + memData.clear(); + inActiveBlock = true; + } + memData.push_back(d1[i]); + memData.push_back(d2[i]); + memData.push_back(d3[i]); + memData.push_back(d4[i]); + } else if (inActiveBlock) { + inActiveBlock = false; + memoryData_[index] = RegisterValue( + (char*)memData.data(), sizeof(uint32_t) * memData.size()); + index++; + } + } + // Add final block if needed + if (inActiveBlock) + memoryData_[index] = RegisterValue((char*)memData.data(), + sizeof(uint32_t) * memData.size()); + + break; + } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] // STORE memoryData_[0] = sourceValues_[0]; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 60099cbb5d..18b27a5708 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7156,7 +7156,6 @@ TEST_P(InstSve, st1d_multivec) { } TEST_P(InstSve, st2d) { - // 32-bit RUN_AARCH64(R"( ptrue p0.d mov x0, #0 @@ -7193,6 +7192,52 @@ TEST_P(InstSve, st2d) { } } +TEST_P(InstSve, st4w) { + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + mov x0, #0 + addvl x1, x0, #1 + mov x2, 
#8 + udiv x3, x1, x2 + whilelo p1.s, xzr, x3 + + sub sp, sp, #4095 + mov x6, #300 + + dup z0.s, #3 + dup z1.s, #4 + dup z2.s, #5 + dup z3.s, #6 + + st4w {z0.s - z3.s}, p0, [sp] + st4w {z0.s - z3.s}, p1, [x6, #4, mul vl] + )"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4)), + 3); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 4), + 4); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 8), + 5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 12), + 6); + } + + int index = 4 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } +} + TEST_P(InstSve, st1w_scatter) { // 32-bit RUN_AARCH64(R"( From 4a8f3f64397342088b9c28ee812060a89328b56c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 18:23:02 +0000 Subject: [PATCH 45/71] Implemented LD1W (4 vec, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 18 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 24 +++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 88e3c1bde4..07c2e84709 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -519,6 +519,24 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1W_4Z: { // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 32; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4b90ef5d55..d36486154b 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3465,6 +3465,10 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1W_4Z: // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 18b27a5708..a0df6713ea 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5684,8 +5684,10 @@ TEST_P(InstSve, ld1w) { dup z3.s, #4 ptrue pn8.s + addvl x1, x1, #1 ld1w {z0.s - z3.s}, pn8/z, [x0, #4, mul vl] + ld1w {z4.s - z7.s}, pn8/z, [x0, x1, lsl #2] )"); 
base = (VL / 32) * 4; offset = (VL / 32); @@ -5711,6 +5713,28 @@ TEST_P(InstSve, ld1w) { src[((base + (offset * 3)) + 2) % 4], src[((base + (offset * 3)) + 3) % 4]}, VL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld2d) { From 3d5b288f08e2f74341d1b3586ff99ef8daf923cb Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 19:09:34 +0000 Subject: [PATCH 46/71] Implemented FMLA (float, VGx4) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 42 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 59 +++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d36486154b..131b327fae 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1847,6 +1847,48 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, {zm1.s - + // zm4.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+          const float* zaRow =
+              sourceValues_[(r * zaStride) + zaIndex].getAsVector<float>();
+          const float* zn = sourceValues_[n + r].getAsVector<float>();
+          const float* zm = sourceValues_[m + r].getAsVector<float>();
+          float out[64] = {0.0f};
+          for (int e = 0; e < elemCount; e++) {
+            out[e] = zaRow[e] + (zn[e] * zm[e]);
+          }
+          results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256);
+        }
+        break;
+      }
       case Opcode::AArch64_FMLA_VG4_M4ZZI_D: {  // fmla za.d[wv, offs, vgx4],
                                                 // {zn1.d - zn4.d}, zm.d[index]
         // SME
diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc
index 4ea8c58a74..9e9406ff51 100644
--- a/test/regression/aarch64/instructions/sme.cc
+++ b/test/regression/aarch64/instructions/sme.cc
@@ -142,6 +142,65 @@ TEST_P(InstSme, mova_tilesToVecs) {
   CHECK_NEON(11, uint8_t, fillNeon<uint8_t>({0x00}, SVL / 8));
 }

+TEST_P(InstSme, fmla_multiVecs) {
+  // float, vgx4
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    smstart
+
+    zero {za}
+
+    # Pre-fill all of za with 24.0f
+    fdup z1.s, #3.0
+    fdup z2.s, #8.0
+    ptrue p0.s
+    ptrue p1.s
+    fmopa za0.s, p0/m, p1/m, z1.s, z2.s
+    fmopa za1.s, p0/m, p1/m, z1.s, z2.s
+    fmopa za2.s, p0/m, p1/m, z1.s, z2.s
+    fmopa za3.s, p0/m, p1/m, z1.s, z2.s
+
+    # initialise registers
+    mov w8, #1
+    fdup z4.s, #0.25
+    fdup z5.s, #1.5
+    fdup z6.s, #-0.5
+    fdup z7.s, #-2.5
+    fdup z8.s, #3.0
+    fdup z9.s, #4.0
+    fdup z10.s, #5.0
+    fdup z11.s, #6.0
+
+    fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, {z8.s - z11.s}
+  )");
+  const uint16_t zaStride = (SVL / 8) / 4;
+  const uint16_t zaQuartIndex = 2;
+  for (uint64_t i = 0; i < (SVL / 8); i++) {
+    // Affected rows each accumulate one {zn, zm} pair onto 24.0f:
+    // (0.25 * 3.0), (1.5 * 4.0), (-0.5 * 5.0) and (-2.5 * 6.0) respectively
+    if (i == zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({24.75f}, (SVL / 8)));
+    } else if (i == zaStride + zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({30.0f}, (SVL / 8)));
+    } else if (i == (2 * zaStride) + zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({21.5f}, (SVL / 8)));
+    } else if (i == (3 * zaStride) + zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({9.0f}, (SVL / 8)));
+    } else {
+      // unaffected rows should still be 24.0f throughout
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({24.0f}, (SVL / 8)));
+    }
+  }
+}
+
 TEST_P(InstSme, fmla_indexed_vgx4) {
   // float
   initialHeapData_.resize(SVL);

From b9dcabeac801775439707660080d6b1d094cdf04 Mon Sep 17 00:00:00 2001
From: Finn Wilkinson
Date: Mon, 4 Nov 2024 11:56:51 +0000
Subject: [PATCH 47/71] Implemented MOVA (array to vecs, 2 registers) SME instruction with tests.
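For the two-register (vgx2) form, both destination vectors take the same slice index, one from each half of ZA. A rough scalar model of the row selection implemented below (the function name is illustrative; zaRowCount is the number of ZA rows, i.e. SVL in bytes):

    #include <array>
    #include <cstdint>

    // Returns the two ZA row numbers whose contents are copied to {zd1, zd2}.
    std::array<uint32_t, 2> movaVg2Rows(uint32_t wv, uint32_t offs,
                                        uint32_t zaRowCount) {
      const uint32_t zaStride = zaRowCount / 2;         // rows per ZA half
      const uint32_t zaIndex = (wv + offs) % zaStride;  // slice within a half
      return {zaIndex, zaStride + zaIndex};
    }

For example, assuming SVL = 512 bits (64 ZA rows, so zaStride = 32), the test's w8 = 1 with offset #1 selects rows 2 and 34.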
--- src/lib/arch/aarch64/Instruction_execute.cc | 18 +++++++ test/regression/aarch64/instructions/sme.cc | 58 ++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 131b327fae..072555cc11 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4177,6 +4177,24 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_MOVA_VG2_2ZMXI: { // mova {zd1.d, zd2.d}, za.d[wv, + // offs, vgx2] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between halves and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[2].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + break; + } case Opcode::AArch64_MOVA_VG4_4ZMXI: { // mova {zd1.d - zd4.d}, za.d[wv, // offs, vgx4] // SME diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 9e9406ff51..066970b9ea 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -48,7 +48,7 @@ TEST_P(InstSme, mova_tileToVec) { } TEST_P(InstSme, mova_zaToVecs) { - // 4 vectors + // 2 vectors initialHeapData_.resize(SVL / 8); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, @@ -83,6 +83,62 @@ TEST_P(InstSme, mova_zaToVecs) { ld1b {z10.b}, p0/z, [x0] udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + # Extravt un-updated values + mov w9, #0 + mova {z20.d, z21.d}, za.d[w9, #0, vgx2] + # Extract 0th and 2nd updated rows + mov {z24.d, z25.d}, za.d[w8, #1, vgx2] + # Extract 1st and 3rd updated rows (get new offset into each half) + addvl x10, x10, #1 + mov x20, #4 + udiv x10, x10, x20 + mov {z26.d, z27.d}, za.d[w10, #2, vgx2] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); + + // 4 vectors + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + mov w9, #0 mova {z20.d - z23.d}, za.d[w9, #0, vgx4] mov {z24.d - 
z27.d}, za.d[w8, #1, vgx4] From b988e0117a095f0be1d5f8bcc7a42e14842dfa29 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 12:36:46 +0000 Subject: [PATCH 48/71] Implemented FADD (float, vgx2) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 37 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 52 +++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 072555cc11..9a9e4239fc 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1373,6 +1373,43 @@ void Instruction::execute() { results_[0] = {add_3ops(sourceValues_), 256}; break; } + case Opcode::AArch64_FADD_VG2_M2Z_S: { // fadd za.s[wv, #off, vgx2], + // {zn1.s, zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FADD_ZPmI_D: { // fadd zdn.d, pg/m, zdn.d, const results_[0] = sveAddPredicated_const(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 066970b9ea..4efd18849f 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -198,6 +198,58 @@ TEST_P(InstSme, mova_tilesToVecs) { CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); } +TEST_P(InstSme, fadd) { + // Float, VGx2 + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #-2.5 + fdup z5.s, #3.0 + + fadd za.s[w8, #1, vgx2], {z4.s, z5.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({21.5f}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + 
CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } +} + TEST_P(InstSme, fmla_multiVecs) { // float, vgx4 RUN_AARCH64(R"( From 4f75ffe8466609b2b8ad267cfc8a7a41cb710b64 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 14:16:30 +0000 Subject: [PATCH 49/71] Implemented LD1D (4 vec, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 18 ++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 26 +++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 07c2e84709..4f52762bac 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -423,6 +423,24 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1D_4Z: { // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 9a9e4239fc..52d94c3b3a 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3041,6 +3041,10 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1D_4Z: // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index a0df6713ea..3acf783558 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5520,6 +5520,10 @@ TEST_P(InstSve, ld1d) { ptrue pn8.d ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] + addvl x1, x1, #1 + mov x2, #2 + udiv x1, x1, x2 + ld1d {z4.d - z7.d}, pn8/z, [x0, x1, lsl #3] )"); base = (VL / 64) * 4; offset = (VL / 64); @@ -5545,6 +5549,28 @@ TEST_P(InstSve, ld1d) { src[((base + (offset * 3)) + 2) % 4], src[((base + (offset * 3)) + 3) % 4]}, VL / 8)); + CHECK_NEON(4, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint64_t, + fillNeon({src[((base + 
(offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld1h) { From f35472b02d6891bde114b75861d00ca297edf818 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 15:13:54 +0000 Subject: [PATCH 50/71] Implemented FMLA (double, VGx4) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 42 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 59 +++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 52d94c3b3a..4d55b022e7 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1884,6 +1884,48 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, {zm1.d - + // zm4.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+ const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* zn = sourceValues_[n + r].getAsVector(); + const double* zm = sourceValues_[m + r].getAsVector(); + double out[32] = {0.0}; + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + (zn[e] * zm[e]); + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FMLA_VG4_M4Z4Z_S: { // fmla za.s[wv, offs, vgx4], // {zn1.s - zn4.s}, {zm1.s - // zm4.s} diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 4efd18849f..40c0b8ec99 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -307,6 +307,65 @@ TEST_P(InstSme, fmla_multiVecs) { fillNeon({24.0f}, (SVL / 8))); } } + + // double, vgx4 + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + fdup z8.d, #3.0 + fdup z9.d, #4.0 + fdup z10.d, #5.0 + fdup z11.d, #6.0 + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, {z8.d - z11.d} + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0 + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.75}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({30.0}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({9.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } } TEST_P(InstSme, fmla_indexed_vgx4) { From 1bf3306deb071c6226fc4160cef9f5808400e337 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 15:19:09 +0000 Subject: [PATCH 51/71] Implemented FADD (double, vgx2) SME instruction with tests. 
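The expected values in the double-precision test follow directly from its setup: ZA is pre-filled by FMOPA with 3.0 * 8.0, and the two source vectors hold -2.5 and 3.0. A standalone check of that arithmetic (not part of the test suite):

    #include <cstdio>

    int main() {
      const double prefill = 3.0 * 8.0;    // every ZA element after the fmopa loop
      const double half0 = prefill - 2.5;  // selected slice in the first ZA half (+ z4.d)
      const double half1 = prefill + 3.0;  // selected slice in the second ZA half (+ z5.d)
      std::printf("%.1f %.1f\n", half0, half1);  // prints "21.5 27.0"
      return 0;
    }

These match the 21.5 and 27.0 rows asserted via CHECK_MAT_ROW, with every other row left at 24.0.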
--- src/lib/arch/aarch64/Instruction_execute.cc | 37 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 52 ++++++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4d55b022e7..459c999261 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1373,6 +1373,43 @@ void Instruction::execute() { results_[0] = {add_3ops(sourceValues_), 256}; break; } + case Opcode::AArch64_FADD_VG2_M2Z_D: { // fadd za.d[wv, #off, vgx2], + // {zn1.d, zn2.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FADD_VG2_M2Z_S: { // fadd za.s[wv, #off, vgx2], // {zn1.s, zn2.s} // SME diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 40c0b8ec99..eb3ef04e4f 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -235,7 +235,6 @@ TEST_P(InstSme, fadd) { const uint16_t zaStride = (SVL / 8) / 2; const uint16_t zaHalfIndex = 2; for (uint64_t i = 0; i < (SVL / 8); i++) { - // Effected rows all use same zm value of 2.0f if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, fillNeon({21.5f}, (SVL / 8))); @@ -248,6 +247,57 @@ TEST_P(InstSme, fadd) { fillNeon({24.0f}, (SVL / 8))); } } + + // Double, VGx2 + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + + # initialise registers + mov w8, #1 + fdup z4.d, #-2.5 + fdup z5.d, #3.0 + + fadd za.d[w8, #1, vgx2], {z4.d, z5.d} + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, 
+ fillNeon({27.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } } TEST_P(InstSme, fmla_multiVecs) { From 4effde42896b63259b6f6526e7b8eb4c14bfca9e Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 15:34:28 +0000 Subject: [PATCH 52/71] Implemented LD1H (Single vec, imm offset) SVE instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 11 +++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 13 +++++++++++++ 3 files changed, 28 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 4f52762bac..78956747f2 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -482,6 +482,17 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1H_IMM: { // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = metadata_.operands[2].mem.disp; + const uint64_t addr = base + (offset * partition_num * 2); + + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); + break; + } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 459c999261..cec2201ebb 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3168,6 +3168,10 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_IMM: // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 3acf783558..f8a7a80606 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5591,6 +5591,7 @@ TEST_P(InstSve, ld1h) { ptrue p0.h # Load and broadcast values from heap ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z2.h}, p0/z, [x0] # Test for inactive lanes mov x1, #0 @@ -5600,6 +5601,10 @@ TEST_P(InstSve, ld1h) { mov x2, #0 whilelo p1.h, xzr, x1 ld1h {z1.h}, p1/z, [x0, x2, lsl #1] + + addvl x10, x10, #1 + add x10, x10, x0 + ld1h {z3.h}, p1/z, [x10, #-1, mul vl] )"); CHECK_NEON(0, uint16_t, fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, @@ -5609,6 +5614,14 @@ TEST_P(InstSve, ld1h) { fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, {0}, VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, + 0x9876, 0xEF01, 0xABCD}, + {0}, VL / 8)); } TEST_P(InstSve, ld1w) { From 40bba12eb5b5a7fa1a752d9343a53d092f5a8f88 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 16:37:53 +0000 Subject: [PATCH 53/71] Added SVE bf16 DOT (indexed) instruction execution logic. 
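The execution logic widens each BF16 input to FP32 before multiplying. BF16 shares FP32's sign bit and 8-bit exponent, so a BF16 value is just the top 16 bits of the corresponding FP32 bit pattern, and widening amounts to copying it into the upper half-word of a zeroed float. A minimal sketch of that conversion (little-endian host assumed, matching the memcpy offsets used in the new case; the helper name is illustrative):

    #include <cstdint>
    #include <cstring>

    float bf16ToFloat(uint16_t bf) {
      float f = 0.0f;
      // Place the BF16 bits in the most significant 16 bits of the float.
      std::memcpy(reinterpret_cast<uint16_t*>(&f) + 1, &bf, sizeof(bf));
      return f;
    }
    // e.g. bf16ToFloat(0x4049) == 3.140625f (bit pattern 0x40490000).

Each 32-bit lane of the destination then accumulates two such widened products, as the BFDOT_ZZI case below does.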
--- src/lib/arch/aarch64/Instruction_execute.cc | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index cec2201ebb..30d591f7b6 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -509,6 +509,44 @@ void Instruction::execute() { results_[0] = RegisterValue(out, 256); break; } + case Opcode::AArch64_BFDOT_ZZI: { // bfdot zd.s, zn.h, zm.h[index] + // BF16 -- EXPERIMENTAL + if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const uint16_t partition_num = VL_bits / 16; + + const float* zd = sourceValues_[0].getAsVector(); + // Extract data as uint16_t so that bytes-per-element is correct + const uint16_t* zn = sourceValues_[1].getAsVector(); + const uint16_t* zm = sourceValues_[2].getAsVector(); + const int index = metadata_.operands[2].vector_index; + + float out[64] = {0.0f}; + for (int i = 0; i < partition_num; i++) { + // MOD 4 as 4 32-bit elements in each 128-bit segment + const int zmBase = i - (i % 4); + const int zmIndex = zmBase + index; + + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. + // Each bf16 is copied into the least significant 16-bits of each + // float variable. + // Need to re-interpret each float destination as a uint16_t* inside + // the memcpy so that the least-significant bits can be accessed. + memcpy((uint16_t*)&zn1 + 1, &zn[2 * i], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * i + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * zmIndex], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * zmIndex + 1], 2); + + out[i] = zd[i] + ((zn1 * zm1) + (zn2 * zm2)); + } + results_[0] = RegisterValue(out, 256); + break; + } case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; From 39323607103a56425b53bfd9f32e9233c168d6be Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 17:12:51 +0000 Subject: [PATCH 54/71] Implemented LD1H (two vec, imm and scalar offset) SVE instruction with tests. 
--- src/lib/arch/aarch64/Instruction_address.cc | 35 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 26 ++++++++++++ test/regression/aarch64/instructions/sve.cc | 46 +++++++++++++++++++++ 3 files changed, 107 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 78956747f2..703df9d849 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -493,6 +493,41 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1H_2Z: { // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 1); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 30d591f7b6..bc56b6186e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3229,6 +3229,32 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_2Z: // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint16_t out[2][128] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 16; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (preds[r][i / 32] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1Onev16b: { // ld1 {vt.16b} [xn] results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f8a7a80606..c1f97fca4a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5574,6 +5574,7 @@ TEST_P(InstSve, ld1d) { } TEST_P(InstSve, ld1h) { + // Single vector initialHeapData_.resize(VL / 4); uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); fillHeap( @@ -5622,6 +5623,51 @@ TEST_P(InstSve, ld1h) { fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, {0}, VL / 
8)); + + // Multi vector + + // Two vector + initialHeapData_.resize(VL); + heap16 = reinterpret_cast(initialHeapData_.data()); + fillHeap( + heap16, {0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, + VL / 2); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ptrue pn8.h + mov x1, #1 + ld1h {z0.h, z1.h}, pn8/z, [x0] + ld1h {z2.h, z3.h}, pn8/z, [x0, x1, lsl #1] + ld1h {z4.h, z5.h}, pn8/z, [x0, #2, mul vl] + )"); + CHECK_NEON(0, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); } TEST_P(InstSve, ld1w) { From 5aad523885719d9b4bd3264fbf47bddac1a7f110 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 17:33:19 +0000 Subject: [PATCH 55/71] Implemented BFMOPA (widening) SME instruction. --- src/lib/arch/aarch64/Instruction_execute.cc | 58 +++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index bc56b6186e..af49bcb743 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2219,6 +2219,64 @@ void Instruction::execute() { results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } + case Opcode::AArch64_BFMOPA_MPPZZ: { // bfmopa zada.s, pn/m, pm/m, zn.h, + // zm.h + // SME + // BF16 -- EXPERIMENTAL + if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint16_t to get 2-byte elements + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + const uint16_t* zm = + sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0.0f}; + // Shifted active is for bf16 elements + uint64_t shifted_active_row = 1ull << ((row % 32) * 2); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + outRow[col] = zadaRow[col]; + // Shifted active is for bf16 elements + uint64_t shifted_active_col = 1ull << ((col % 32) * 2); + bool pred_row1 = pn[(2 * row) / 32] & shifted_active_row; + bool pred_row2 = pn[(2 * row + 1) / 32] & shifted_active_row; + bool pred_col1 = pm[(2 * col) / 32] & shifted_active_col; + bool pred_col2 = pm[(2 * col + 1) / 32] & shifted_active_col; + if ((pred_row1 && pred_col1) || (pred_row2 && pred_col2)) { + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. 
+ // Each bf16 is copied into the least significant 16-bits of each + // float variable. + // Need to re-interpret each float destination as a uint16_t* + // inside the memcpy so that the least-significant bits can be + // accessed. + memcpy((uint16_t*)&zn1 + 1, &zn[2 * row], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * row + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * col], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * col + 1], 2); + outRow[col] += (pred_row1 && pred_col1) ? zn1 * zm1 : 0.0f; + outRow[col] += (pred_row2 && pred_col2) ? zn2 * zm2 : 0.0f; + } + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, // zm.d // SME From 430c775915055369156ed1c058af870411048e8b Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 18:01:57 +0000 Subject: [PATCH 56/71] Minor UMAXP fix. --- src/include/simeng/arch/aarch64/helpers/neon.hh | 4 ++-- test/regression/aarch64/instructions/neon.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2bf42e6fa..a10c8afd74 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -570,8 +570,8 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) { // Concatenate the vectors T temp[2 * I]; - memcpy(temp, m, sizeof(T) * I); - memcpy(temp + (sizeof(T) * I), n, sizeof(T) * I); + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); // Compare each adjacent pair of elements T out[I]; for (int i = 0; i < I; i++) { diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index ca9ae26a4e..1621cbbdad 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -2738,7 +2738,7 @@ TEST_P(InstNeon, umaxp) { ldr q0, [x0] ldr q1, [x0, #16] - umaxp v2.16b, v0.16b, v1.16b + umaxp v2.16b, v1.16b, v0.16b )"); CHECK_NEON(2, uint8_t, From a01c2fca5406a27765bbbc6617e393f1b2c97fde Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 18:13:27 +0000 Subject: [PATCH 57/71] Fixed function comment. --- src/include/simeng/arch/aarch64/Instruction.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index bee47e01bc..f3854c84b4 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -286,7 +286,7 @@ enum class InsnType : uint32_t { /** Predefined shift values for converting pred-as-counter to pred-as-mask. */ const uint64_t predCountShiftVals[9] = {0, 1, 2, 0, 3, 0, 0, 0, 4}; -/** Convert Predicate-as-Mask to Predicate-as-Masks. +/** Convert Predicate-as-Counter to Predicate-as-Masks. * T represents the element type (i.e. for pg.s, T = uint32_t). * V represents the number of vectors the predicate-as-counter is being used * for. */ From 9790c6e8098cb1e168969a9e4d020ae3b2cceba5 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 5 Nov 2024 17:42:33 +0000 Subject: [PATCH 58/71] Updated BF16 comment. 
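The reworded comments describe the layout more precisely: BF16 is FP32 with the low 16 mantissa bits dropped, so the payload belongs in the most significant half of the float. A small illustration of the same relationship in the narrowing direction (truncation only; round-to-nearest would need extra handling):

    #include <cstdint>
    #include <cstring>

    uint16_t floatToBf16Truncate(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      // Keep the sign bit, the 8 exponent bits and the top 7 mantissa bits.
      return static_cast<uint16_t>(bits >> 16);
    }
    // floatToBf16Truncate(3.14159f) == 0x4049; widening it back gives 3.140625f.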
--- src/lib/arch/aarch64/Instruction_execute.cc | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index af49bcb743..a058c354fa 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -533,10 +533,9 @@ void Instruction::execute() { float zn1, zn2, zm1, zm2; // Horrible hack in order to convert bf16 (currently stored in a // uint16_t) into a float. - // Each bf16 is copied into the least significant 16-bits of each - // float variable. - // Need to re-interpret each float destination as a uint16_t* inside - // the memcpy so that the least-significant bits can be accessed. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. memcpy((uint16_t*)&zn1 + 1, &zn[2 * i], 2); memcpy((uint16_t*)&zn2 + 1, &zn[2 * i + 1], 2); memcpy((uint16_t*)&zm1 + 1, &zm[2 * zmIndex], 2); @@ -2260,11 +2259,9 @@ void Instruction::execute() { float zn1, zn2, zm1, zm2; // Horrible hack in order to convert bf16 (currently stored in a // uint16_t) into a float. - // Each bf16 is copied into the least significant 16-bits of each - // float variable. - // Need to re-interpret each float destination as a uint16_t* - // inside the memcpy so that the least-significant bits can be - // accessed. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. memcpy((uint16_t*)&zn1 + 1, &zn[2 * row], 2); memcpy((uint16_t*)&zn2 + 1, &zn[2 * row + 1], 2); memcpy((uint16_t*)&zm1 + 1, &zm[2 * col], 2); From 5bc9330315777b37132c211877f88a39759cf8f0 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 12:45:32 +0000 Subject: [PATCH 59/71] Implemented NEON UDOT (by vector) instruction with tests. --- .../simeng/arch/aarch64/helpers/neon.hh | 27 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 +++ test/regression/aarch64/instructions/neon.cc | 24 +++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index a10c8afd74..52d0ef9011 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,33 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.b`. D represents the number of elements in the output vector to be updated + * (i.e. for vd.2s D = 2). Only 2 or 4 are valid. Returns correctly formatted + * RegisterValue. 
*/ +template +RegisterValue vecUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + out[i] = vd[i]; + for (int j = 0; j < 4; j++) { + out[i] += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * i) + j])); + } + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `udot vd.s, vn.b, * vm.4b[index]`. * D represents the number of elements in the output vector to be updated (i.e. diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a058c354fa..505520287c 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6550,6 +6550,10 @@ void Instruction::execute() { metadata_, VL_bits); break; } + case Opcode::AArch64_UDOTv16i8: { // udot vd.4s, vn.16b, vm.16b + results_[0] = vecUdot<4>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); break; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 1621cbbdad..6271023ea4 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3681,6 +3681,30 @@ TEST_P(InstNeon, udot) { CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); + + // udot by vector + initialHeapData_.resize(128); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFFFFFF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA98FFFFFFFF; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + + udot v2.4s, v1.16b, v0.16b + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFFFFFF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA98FFFFFFFF, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0x3F807, 0x288E7, 0x27C6E, 0xB52C}); } TEST_P(InstNeon, uzp) { From 1fd130cde40b882e8a02dbf0217ba57ddd904be5 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 18:20:14 +0000 Subject: [PATCH 60/71] Implemented SVE UDOT (by vector, 4-way) instruction with tests. --- .../simeng/arch/aarch64/helpers/sve.hh | 28 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 5 ++++ test/regression/aarch64/instructions/sve.cc | 22 ++++++++++++++- 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index c963b22f7a..50eb19c657 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1618,6 +1618,34 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `udot zd, zn, zm`. + * D represents the element type of the destination register (i.e. for zd.s, + * D = uint32_t). 
+ * N represents the element type of the source registers (i.e. for zn.b, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + out[i] = zd[i]; + for (int j = 0; j < W; j++) { + out[i] += + (static_cast(zn[(W * i) + j]) * static_cast(zm[(W * i) + j])); + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `udot zd, zn, * zm[index]`. * D represents the element type of the destination register (i.e. for uint32_t, diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 505520287c..aa44d69079 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6545,6 +6545,11 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_UDOT_ZZZ_S: { // udot zd.s, zn.b, zm.b + results_[0] = + sveUdot(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] results_[0] = sveUdot_indexed(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c1f97fca4a..43382697e6 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -8032,9 +8032,29 @@ TEST_P(InstSve, udot) { udot z4.s, z2.b, z0.b[0] udot z5.s, z3.b, z0.b[3] )"); - CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); + + // udot by vector - 4-way + initialHeapData_.resize(16); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] + + dup z2.b, #2 + dup z4.s, #4 + + udot z4.s, z2.b, z0.b + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534, 1652, 1630, 420}, VL / 8)); } TEST_P(InstSve, uqdec) { From 81ddba7e9737ccf9f38ed0567449b064d8b8090a Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 20:44:12 +0000 Subject: [PATCH 61/71] Implemented SVE ST4W (scalar offset) instruction with tests, and changed address generation logic for ST2W and ST4W. --- .../simeng/arch/aarch64/operandContainer.hh | 2 +- src/lib/arch/aarch64/Instruction_address.cc | 68 +++++++++++++++++-- src/lib/arch/aarch64/Instruction_execute.cc | 51 +++----------- test/regression/aarch64/instructions/sve.cc | 10 +++ 4 files changed, 82 insertions(+), 49 deletions(-) diff --git a/src/include/simeng/arch/aarch64/operandContainer.hh b/src/include/simeng/arch/aarch64/operandContainer.hh index c73b8881da..996454b007 100644 --- a/src/include/simeng/arch/aarch64/operandContainer.hh +++ b/src/include/simeng/arch/aarch64/operandContainer.hh @@ -10,7 +10,7 @@ namespace arch { namespace aarch64 { /** The maximum number of source registers a non-SME instruction can have. 
*/ -const uint8_t MAX_SOURCE_REGISTERS = 6; +const uint8_t MAX_SOURCE_REGISTERS = 7; /** The maximum number of destination registers a non-SME instruction can have. */ diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 703df9d849..7e4da09efc 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1299,8 +1299,52 @@ span Instruction::generateAddresses() { uint64_t addr = base + (offset * partition_num * 8); - generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 8, p, - addresses); + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt1[1], zt2[1], ...) we must generate an address for each element (if + // the predicate is true for that element). This is because, if the + // predicate indicates that all elements are active, a single address + // and MemoryAccessTarget will be generated with a size of 2xVL. This + // could lead to issues for core models which have a maximum store + // bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + addresses.push_back({addr + (2 * i * 8), 8}); + addresses.push_back({addr + (2 * i * 8) + 8, 8}); + } + } + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST4W: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = sourceValues_[6].get(); + + std::vector addresses; + addresses.reserve(partition_num * 4); + + uint64_t addr = base + (offset << 2); + + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } setMemoryAddresses(std::move(addresses)); break; } @@ -1315,12 +1359,24 @@ span Instruction::generateAddresses() { std::vector addresses; addresses.reserve(partition_num * 4); - uint64_t addr = base + (offset * partition_num * 4); - generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 4, p, - addresses); - + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. 
+ for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } setMemoryAddresses(std::move(addresses)); break; } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index aa44d69079..c63d38e3d2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5709,33 +5709,15 @@ void Instruction::execute() { const uint64_t* d2 = sourceValues_[1].getAsVector(); const uint64_t* p = sourceValues_[2].getAsVector(); - std::vector memData; - bool inActiveBlock = false; - const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - // If active and not in active block, initialise - if (!inActiveBlock) { - memData.clear(); - inActiveBlock = true; - } - memData.push_back(d1[i]); - memData.push_back(d2[i]); - } else if (inActiveBlock) { - inActiveBlock = false; - memoryData_[index] = RegisterValue( - (char*)memData.data(), sizeof(uint64_t) * memData.size()); - index++; + memoryData_[index++] = RegisterValue(d1[i], 8); + memoryData_[index++] = RegisterValue(d2[i], 8); } } - // Add final block if needed - if (inActiveBlock) - memoryData_[index] = RegisterValue((char*)memData.data(), - sizeof(uint64_t) * memData.size()); - break; } case Opcode::AArch64_ST2Twov4s_POST: { // st2 {vt1.4s, vt2.4s}, [xn], @@ -5755,6 +5737,9 @@ void Instruction::execute() { results_[0] = sourceValues_[2].get() + postIndex; break; } + case Opcode::AArch64_ST4W: // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + [[fallthrough]]; case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, // pg, [{, #imm, mul vl}] // STORE @@ -5764,35 +5749,17 @@ void Instruction::execute() { const uint32_t* d4 = sourceValues_[3].getAsVector(); const uint64_t* p = sourceValues_[4].getAsVector(); - std::vector memData; - bool inActiveBlock = false; - const uint16_t partition_num = VL_bits / 32; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (p[i / 16] & shifted_active) { - // If active and not in active block, initialise - if (!inActiveBlock) { - memData.clear(); - inActiveBlock = true; - } - memData.push_back(d1[i]); - memData.push_back(d2[i]); - memData.push_back(d3[i]); - memData.push_back(d4[i]); - } else if (inActiveBlock) { - inActiveBlock = false; - memoryData_[index] = RegisterValue( - (char*)memData.data(), sizeof(uint32_t) * memData.size()); - index++; + memoryData_[index++] = RegisterValue(d1[i], 4); + memoryData_[index++] = RegisterValue(d2[i], 4); + memoryData_[index++] = RegisterValue(d3[i], 4); + memoryData_[index++] = RegisterValue(d4[i], 4); } } - // Add final block if needed - if (inActiveBlock) - memoryData_[index] = RegisterValue((char*)memData.data(), - sizeof(uint32_t) * memData.size()); - break; } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 43382697e6..a4103b9ecb 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7321,6 +7321,8 @@ TEST_P(InstSve, st4w) { st4w 
{z0.s - z3.s}, p0, [sp] st4w {z0.s - z3.s}, p1, [x6, #4, mul vl] + addvl x7, x7, #3 + st4w {z0.s - z3.s}, p1, [x6, x7, lsl #2] )"); for (uint64_t i = 0; i < (VL / 32); i++) { @@ -7345,6 +7347,14 @@ TEST_P(InstSve, st4w) { EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); } + + index = 12 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } } TEST_P(InstSve, st1w_scatter) { From 4c99a0f4a1e69e6a144c3400527c7d1800ddc97d Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 22:00:13 +0000 Subject: [PATCH 62/71] Implemented LD1B (4 vec, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 17 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 5 +- test/regression/aarch64/instructions/sve.cc | 92 ++++++++++++++++++++- 3 files changed, 112 insertions(+), 2 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 7e4da09efc..67e4599e5c 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -396,6 +396,23 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1B_4Z: { // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index c63d38e3d2..d398e7ef39 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3075,7 +3075,7 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] + case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3147,6 +3147,9 @@ void Instruction::execute() { // mul vl}] // LOAD [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z: // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index a4103b9ecb..16a966d00a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5035,6 +5035,7 @@ TEST_P(InstSve, ld1b) { mov x1, #4 ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + ld1b {z4.b - z7.b}, pn8/z, [x0, x1] ld1b {z16.b, z20.b, z24.b, z28.b}, pn8/z, [x0, #4, mul vl] ld1b {z17.b, z21.b, z25.b, z29.b}, pn8/z, [x0, x1] )"); @@ -5125,7 +5126,95 @@ TEST_P(InstSve, 
ld1b) { src[((base + (3 * offset)) + 15) % 16], }, VL / 8)); + base = 4; + offset = (VL / 8); + CHECK_NEON(4, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(4, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(6, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); // Strided (4-stride) vectors + base = (VL / 8) * 4; + offset = (VL / 8); CHECK_NEON(16, uint8_t, fillNeon( { @@ -5210,7 +5299,8 @@ TEST_P(InstSve, ld1b) { src[((base + (3 * offset)) + 15) % 16], }, VL / 8)); - base = (VL / 8) + 4; + base = 4; + offset = (VL / 8); CHECK_NEON(17, uint8_t, fillNeon( { From 0d74234ea2e591fab17e8da5baac2a78b93625be Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 11:34:27 +0000 Subject: [PATCH 63/71] Implemented UDOT (4-way, VGx4 8-bit to 32-bit widening) SME instruction with tests. 
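
As a cross-check for the UDOT helpers above and the SME variant below, the 4-way unsigned dot product can be stated as a tiny scalar reference model: each 32-bit accumulator element gains the sum of four adjacent 8-bit products. The sketch below is illustrative only (plain arrays instead of SimEng's RegisterValue/sourceValues containers; udot4_reference and the variable names are invented), but it reproduces the 156/316/476/636 row values used in the vgx4 regression test.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Reference model of a 4-way unsigned dot product with accumulate:
    //   acc[i] += a[4i+0]*b[4i+0] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3]
    void udot4_reference(std::vector<uint32_t>& acc, const std::vector<uint8_t>& a,
                         const std::vector<uint8_t>& b) {
      for (size_t i = 0; i < acc.size(); i++) {
        for (size_t j = 0; j < 4; j++) {
          acc[i] += static_cast<uint32_t>(a[4 * i + j]) *
                    static_cast<uint32_t>(b[4 * i + j]);
        }
      }
    }

    int main() {
      std::vector<uint32_t> acc(4, 96);  // rows pre-filled with 96 by the umopa setup
      std::vector<uint8_t> a(16, 10);    // mirrors `dup z4.b, #10`
      std::vector<uint8_t> b(16);        // mirrors heap bytes 0..15
      for (size_t i = 0; i < b.size(); i++) b[i] = static_cast<uint8_t>(i);
      udot4_reference(acc, a, b);
      for (uint32_t v : acc) std::printf("%u ", v);  // 156 316 476 636
      std::printf("\n");
      return 0;
    }
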
--- src/lib/arch/aarch64/Instruction_execute.cc | 49 ++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 65 ++++++++++++++++++++- 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d398e7ef39..4300fc68c3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6392,6 +6392,55 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_VG4_M4Z4Z_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, {zm1.b - + // zm4.b} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get base zn and zm register indexed in sourceValues + const uint16_t znBase = zaRowCount + 1; + const uint16_t zmBase = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = sourceValues_[znBase + r].getAsVector(); + const uint8_t* zmr = sourceValues_[zmBase + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // There are 4 8-bit elements per 32-bit element of `znr` and `zmr` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zmr[4 * e + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_UDOT_VG4_M4ZZI_BtoS: { // udot za.s[wv, #off, vgx4], // {zn1.b - zn4.b}, // zm.b[#index] diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index eb3ef04e4f..8b12725472 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -1086,7 +1086,70 @@ TEST_P(InstSme, udot_Indexed_vgx4) { } } -TEST_P(InstSme, uvdot_vgx4) { +TEST_P(InstSme, udot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z8.b}, p0/z, [x0] + ld1b 
{z9.b}, p0/z, [x0] + ld1b {z10.b}, p0/z, [x0] + ld1b {z11.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, {z8.b - z11.b} + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({156, 316, 476, 636}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({162, 338, 514, 690}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({168, 360, 552, 744}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({174, 382, 590, 798}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + +TEST_P(InstSme, uvdot_indexed_vgx4) { // 8-bit to 32-bit widening initialHeapData_.resize(SVL / 8); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 40a0fa4d0c762eb3e59f67d289f81f280a797856 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 16:50:31 +0000 Subject: [PATCH 64/71] Implemented ADD (uint32, vgx2, vectors and ZA), SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 34 ++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 45 +++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4300fc68c3..28a2546797 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -328,6 +328,40 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_ADD_VG2_M2Z_S: { // add za.s[wv, off, vgx2], {zn1.s, + // zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + for (int r = 0; r < 2; r++) { + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint32_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + for (int i = 0; i < elemCount; i++) { + out[i] = zaRow[i] + znr[i]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_ADR: { // adr xd, #imm results_[0] = instructionAddress_ + metadata_.operands[1].imm; break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 8b12725472..aca3d0ba99 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,6 +7,51 @@ namespace { using InstSme = AArch64RegressionTest; +TEST_P(InstSme, add) { + // uint32_T, vgx2, vecs with ZA + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 
(uint32_t) + dup z0.b, #8 + dup z1.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z0.b, z1.b + umopa za1.s, p0/m, p1/m, z0.b, z1.b + umopa za2.s, p0/m, p1/m, z0.b, z1.b + umopa za3.s, p0/m, p1/m, z0.b, z1.b + + # Set 2 of the za rows + mov w8, #1 + dup z0.s, #8 + dup z1.s, #3 + add za.s[w8, #1, vgx2], {z0.s, z1.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({104}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({99}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, mova_tileToVec) { // 8-bit RUN_AARCH64(R"( From 950de4124cdfbfb13bc67fcac043da4232451e0a Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 19:58:23 +0000 Subject: [PATCH 65/71] Implemented ZIP (4 vectors) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 23 +++++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 15 +++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 28a2546797..558ebc0525 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -7076,6 +7076,29 @@ void Instruction::execute() { results_[0] = vecZip(sourceValues_, true); break; } + case Opcode::AArch64_ZIP_VG4_4Z4Z_S: { // zip {zd1.s - zd4.s}, {zn1.s - + // zn4.s} + const uint32_t* zn[4]; + zn[0] = sourceValues_[0].getAsVector(); + zn[1] = sourceValues_[1].getAsVector(); + zn[2] = sourceValues_[2].getAsVector(); + zn[3] = sourceValues_[3].getAsVector(); + + const uint16_t quads = VL_bits / (32 * 4); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + for (int r = 0; r < 4; r++) { + const uint16_t base = r * quads; + for (int q = 0; q < quads; q++) { + out[r][4 * q] = zn[0][base + q]; + out[r][4 * q + 1] = zn[1][base + q]; + out[r][4 * q + 2] = zn[2][base + q]; + out[r][4 * q + 3] = zn[3][base + q]; + } + results_[r] = RegisterValue(out[r], 256); + } + break; + } case Opcode::AArch64_ZERO_M: { // zero {mask} // SME // Not in right context mode. 
Raise exception diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 16a966d00a..f9699593f3 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -9145,13 +9145,26 @@ TEST_P(InstSve, zip) { zip1 z16.b, z12.b, z13.b zip2 z17.b, z14.b, z15.b )"); - CHECK_NEON(4, double, fillNeon({0.5, -0.5}, VL / 8)); CHECK_NEON(5, double, fillNeon({0.75, -0.75}, VL / 8)); CHECK_NEON(10, float, fillNeon({0.5, -0.75}, VL / 8)); CHECK_NEON(11, float, fillNeon({-0.5, 0.75}, VL / 8)); CHECK_NEON(16, int8_t, fillNeon({1, -2}, VL / 8)); CHECK_NEON(17, int8_t, fillNeon({-1, 2}, VL / 8)); + + // Multi-vector + RUN_AARCH64(R"( + #32-bit + dup z0.s, #5 + dup z1.s, #6 + dup z2.s, #7 + dup z3.s, #8 + zip {z4.s - z7.s}, {z0.s - z3.s} + )"); + CHECK_NEON(4, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(6, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(7, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); } TEST_P(InstSve, psel) { From 03a95e70f21701be27d2f8e01bdd31955a53f248 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 10 Dec 2024 12:11:14 +0000 Subject: [PATCH 66/71] Attended PR comments. --- CMakeLists.txt | 3 +-- src/include/simeng/arch/aarch64/Instruction.hh | 7 ++++--- src/include/simeng/arch/aarch64/helpers/sve.hh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 42111288ff..afeeb4abac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,8 +157,7 @@ if(SIMENG_ENABLE_TESTS) # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.") - endif() - if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") + elseif (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. Related test will fail.") endif() diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index f3854c84b4..6db73f0e69 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -306,9 +306,10 @@ std::vector> predAsCounterToMasks( for (int i = 0; i < elemsPerVec; i++) { // Move bit to next position based on element type uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - // If invert = 1, predElemCount dictates number of initial inactive - // elements. - // If invert = 0, it is number of initial active elements. + // If invert = True (invert bit = 1), predElemCount dictates number of + // initial inactive elements. + // If invert = False (invert bit = 0), it indicates the number of initial + // active elements. if ((r * elemsPerVec) + i < predElemCount) { out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; } else { diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 50eb19c657..a42bd9680c 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -626,7 +626,7 @@ std::enable_if_t, RegisterValue> sveFDivPredicated( return {out, 256}; } -/** Helpfer function for SVE instructions with the format `faddv rd, pg, zn. +/** Helper function for SVE instructions with the format `faddv rd, pg, zn. 
* D represents the source vector element type and the destination scalar * register type (i.e. for zn.s and sd, D = float). * Returns correctly formatted RegisterValue. */ From 672936312ab2b115cd167ce17d59affbb079fd9b Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 13 Dec 2024 18:02:01 +0000 Subject: [PATCH 67/71] Minor bug fixes. --- src/lib/arch/aarch64/Instruction_decode.cc | 11 ++++++----- src/lib/arch/aarch64/Instruction_execute.cc | 9 +++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..a6bc075efd 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -537,8 +537,9 @@ void Instruction::decode() { if (isInstruction(InsnType::isStoreData)) { // Identify store instruction group - if (AARCH64_REG_Z0 <= metadata_.operands[0].reg && - metadata_.operands[0].reg <= AARCH64_REG_Z31) { + if ((AARCH64_REG_Z0 <= metadata_.operands[0].reg && + metadata_.operands[0].reg <= AARCH64_REG_Z31) || + metadata_.operands[0].reg == AARCH64_REG_ZT0) { setInstructionType(InsnType::isSVEData); } else if ((metadata_.operands[0].reg <= AARCH64_REG_S31 && metadata_.operands[0].reg >= AARCH64_REG_Q0) || @@ -548,7 +549,7 @@ void Instruction::decode() { } else if (metadata_.operands[0].is_vreg) { setInstructionType(InsnType::isVectorData); } else if ((metadata_.operands[0].reg >= AARCH64_REG_ZAB0 && - metadata_.operands[0].reg <= AARCH64_REG_ZT0) || + metadata_.operands[0].reg < AARCH64_REG_ZT0) || metadata_.operands[0].reg == AARCH64_REG_ZA) { setInstructionType(InsnType::isSMEData); } @@ -644,8 +645,8 @@ void Instruction::decode() { } } } else { - // For SME instructions, resize the following structures to have the - // exact amount of space required + // For SME instructions (not using ZT0), resize the following structures to + // have the exact amount of space required sourceRegisters_.resize(sourceRegisterCount_); destinationRegisters_.resize(destinationRegisterCount_); sourceValues_.resize(sourceRegisterCount_); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 558ebc0525..6ed9d6695a 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4181,10 +4181,11 @@ void Instruction::execute() { } case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm // LOAD - results_[1] = RegisterValue( - static_cast(memoryData_[0].get()), 4); - results_[0] = - sourceValues_[0].get() + metadata_.operands[2].imm; + results_[1] = + RegisterValue(static_cast(memoryData_[0].get()), 4) + .zeroExtend(4, 8); + results_[0] = RegisterValue( + sourceValues_[0].get() + metadata_.operands[2].imm, 8); break; } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend From 850b741068d7adf2696bfc4e01a7f9c8450c9a7c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 16 Dec 2024 13:16:01 +0000 Subject: [PATCH 68/71] Attended PR comments. 
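
The LDRSBWpost change in the previous patch hinges on two AArch64 facts: the loaded byte is sign-extended to 32 bits, and a write to a W register clears bits 63:32 of the underlying X register. A stand-alone illustration of the intended destination value (not SimEng code; ldrsb_w_result is an invented name):

    #include <cstdint>
    #include <cstdio>

    // Models the destination value of `ldrsb wt, [xn], #imm`:
    // sign-extend the byte to 32 bits, then zero-extend into the 64-bit register.
    uint64_t ldrsb_w_result(uint8_t loadedByte) {
      int32_t signExtended = static_cast<int8_t>(loadedByte);  // e.g. 0xF0 -> -16
      return static_cast<uint32_t>(signExtended);              // upper 32 bits cleared
    }

    int main() {
      std::printf("%#llx\n", (unsigned long long)ldrsb_w_result(0xF0));  // 0xfffffff0
      std::printf("%#llx\n", (unsigned long long)ldrsb_w_result(0x7B));  // 0x7b
      return 0;
    }
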
--- .../simeng/arch/aarch64/Instruction.hh | 5 +---- src/include/simeng/version.hh.in | 2 +- src/lib/arch/aarch64/Instruction_execute.cc | 7 ++++--- .../aarch64/AArch64RegressionTest.hh | 21 ++++++++----------- test/regression/aarch64/Exception.cc | 2 ++ test/regression/aarch64/instructions/float.cc | 6 +++--- test/regression/aarch64/instructions/sme.cc | 2 +- 7 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index 6db73f0e69..b1ffb97575 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -283,9 +283,6 @@ enum class InsnType : uint32_t { isBranch = 1 << 14 }; -/** Predefined shift values for converting pred-as-counter to pred-as-mask. */ -const uint64_t predCountShiftVals[9] = {0, 1, 2, 0, 3, 0, 0, 0, 4}; - /** Convert Predicate-as-Counter to Predicate-as-Masks. * T represents the element type (i.e. for pg.s, T = uint32_t). * V represents the number of vectors the predicate-as-counter is being used @@ -300,7 +297,7 @@ std::vector> predAsCounterToMasks( const bool invert = (predAsCounter & 0b1000000000000000) != 0; const uint64_t predElemCount = (predAsCounter & static_cast(0b0111111111111111)) >> - predCountShiftVals[sizeof(T)]; + static_cast(std::log2f(sizeof(T)) + 1); for (int r = 0; r < V; r++) { for (int i = 0; i < elemsPerVec; i++) { diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in index 8a2a823a66..f563e281f9 100644 --- a/src/include/simeng/version.hh.in +++ b/src/include/simeng/version.hh.in @@ -9,6 +9,6 @@ #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@ #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}" #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}" -#define SIMENG_ENABLE_BF16 "${SIMENG_ENABLE_BF16}" +#define SIMENG_ENABLE_BF16 ${SIMENG_ENABLE_BF16} #endif \ No newline at end of file diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 6ed9d6695a..78dcb6c5d8 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -520,10 +520,10 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } +#if SIMENG_ENABLE_BF16 == 1 case Opcode::AArch64_BF16DOTlanev8bf16: { // bfdot vd.4s, vn.8h, // vm.2h[index] // BF16 -- EXPERIMENTAL - if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); // Must be enabled at SimEng compile time // Not verified to be working for all compilers or OSs. // No Tests written @@ -545,7 +545,6 @@ void Instruction::execute() { } case Opcode::AArch64_BFDOT_ZZI: { // bfdot zd.s, zn.h, zm.h[index] // BF16 -- EXPERIMENTAL - if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); // Must be enabled at SimEng compile time // Not verified to be working for all compilers or OSs. 
// No Tests written @@ -580,6 +579,7 @@ void Instruction::execute() { results_[0] = RegisterValue(out, 256); break; } +#endif case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; @@ -2252,11 +2252,11 @@ void Instruction::execute() { results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } +#if SIMENG_ENABLE_BF16 == 1 case Opcode::AArch64_BFMOPA_MPPZZ: { // bfmopa zada.s, pn/m, pm/m, zn.h, // zm.h // SME // BF16 -- EXPERIMENTAL - if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); // Must be enabled at SimEng compile time // Not verified to be working for all compilers or OSs. // No Tests written @@ -2308,6 +2308,7 @@ void Instruction::execute() { } break; } +#endif case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, // zm.d // SME diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 8285726ee7..6afdc47d2a 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -192,20 +192,19 @@ inline std::vector> genCoreTypeSVLPairs( /** Check each element of the Lookup Table register ZT0 against expected values. * - * The `tag` argument is the register index (must be 0), and the `type` argument - * is the C++ data type to use for value comparisons. The third argument should - * be an initializer list containing one value for each register element (for a - * total of `(64 / sizeof(type))` values). + * The `type` argument is the C++ data type to use for value comparisons. The + * third argument should be an initializer list containing one value for each + * register element (for a total of `(64 / sizeof(type))` values). * * For example: * * // Compare zt0 to some expected 32-bit uint64 values. * CHECK_TABLE(0, uint32_t, {1, 2, 3, 4, ..., 16}); */ -#define CHECK_TABLE(tag, type, ...) \ - { \ - SCOPED_TRACE("<<== error generated here"); \ - checkTableRegister(tag, __VA_ARGS__); \ +#define CHECK_TABLE(type, ...) \ + { \ + SCOPED_TRACE("<<== error generated here"); \ + checkTableRegister(__VA_ARGS__); \ } /** A helper macro to predecode the first instruction in a snippet of Armv9.2-a @@ -385,11 +384,9 @@ class AArch64RegressionTest : public RegressionTest { * better diagnostic messages, rather than called directly from test code. 
*/ template - void checkTableRegister(uint8_t tag, - const std::array& values) const { - assert(tag == 0 && "Only a tag of value 0 is valid for Table registers"); + void checkTableRegister(const std::array& values) const { const T* data = RegressionTest::getVectorRegister( - {simeng::arch::aarch64::RegisterType::TABLE, tag}); + {simeng::arch::aarch64::RegisterType::TABLE, 0}); for (unsigned i = 0; i < (64 / sizeof(T)); i++) { EXPECT_NEAR(data[i], values[i], 0.0005) << "Mismatch for element " << i << "."; diff --git a/test/regression/aarch64/Exception.cc b/test/regression/aarch64/Exception.cc index 2133629473..b987ae4429 100644 --- a/test/regression/aarch64/Exception.cc +++ b/test/regression/aarch64/Exception.cc @@ -151,6 +151,7 @@ TEST_P(Exception, unmapped_sys_reg) { EXPECT_EQ(stdout_.substr(0, strlen(err)), err); } +#if SIMENG_LLVM_VERSION >= 14 // TODO: Write test for InstructionException::StreamingModeUpdate once it has a // trigger case // TODO: Write test for InstructionException::ZAregisterStatusUpdate once it has @@ -370,6 +371,7 @@ TEST_P(Exception, svcr) { fillNeon({0}, SVL / 8)); } } +#endif INSTANTIATE_TEST_SUITE_P( AArch64, Exception, diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index bc2d09ea27..627e710e7c 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -1459,9 +1459,9 @@ TEST_P(InstFloat, ucvtf) { // representation error to ensure tests pass initialHeapData_.resize(12); heap32 = reinterpret_cast(initialHeapData_.data()); - heap32[0] = 0x000001EE; - heap32[1] = 0x00021F3B; - heap32[2] = 0x32FE6B75; + heap32[0] = 0x000001EE; // 123.5 (2 fraction bits) + heap32[1] = 0x00021F3B; // 543.23 (8 fraction bits) + heap32[2] = 0x32FE6B75; // 101.987654321 (23 fraction bits) RUN_AARCH64(R"( # Get heap address mov x0, 0 diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index aca3d0ba99..d908d13a1d 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -1294,7 +1294,7 @@ TEST_P(InstSme, zero) { zero {zt0} )"); - CHECK_TABLE(0, uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); + CHECK_TABLE(uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); // ZA tiles RUN_AARCH64(R"( From 1d0409697df3921e553057312dfb7e07772601d6 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 18 Dec 2024 10:29:18 +0000 Subject: [PATCH 69/71] Updated multi-vector load logic. 
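
The hunks below hoist the per-register getAsVector call out of the element loop, so each destination register's backing memory block is fetched once and the inner loop only tests predicate lanes. A self-contained sketch of that pattern (invented names and a plain bool mask; SimEng's predicates are packed 64-bit bitmasks and its data comes from memoryData_):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    constexpr int kRegs = 2;
    constexpr int kElems = 4;

    // Predicated multi-register load pattern: fetch each register's source
    // block once, then copy only the active lanes; inactive lanes keep their
    // existing (here zero-initialised) value.
    void predicatedMultiLoad(std::array<std::array<uint32_t, kElems>, kRegs>& out,
                             const std::array<std::array<uint32_t, kElems>, kRegs>& mem,
                             const std::array<std::array<bool, kElems>, kRegs>& pred) {
      for (int r = 0; r < kRegs; r++) {
        const uint32_t* data = mem[r].data();  // hoisted out of the element loop
        for (int i = 0; i < kElems; i++) {
          if (pred[r][i]) out[r][i] = data[i];
        }
      }
    }

    int main() {
      std::array<std::array<uint32_t, kElems>, kRegs> out{};
      const std::array<std::array<uint32_t, kElems>, kRegs> mem{{{1, 2, 3, 4}, {5, 6, 7, 8}}};
      const std::array<std::array<bool, kElems>, kRegs> pred{{{true, false, true, false},
                                                              {true, true, false, false}}};
      predicatedMultiLoad(out, mem, pred);
      for (int r = 0; r < kRegs; r++) {
        for (int i = 0; i < kElems; i++) std::printf("%u ", out[r][i]);
        std::printf("\n");  // prints: 1 0 3 0   then   5 6 0 0
      }
      return 0;
    }
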
--- src/lib/arch/aarch64/Instruction_execute.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 78dcb6c5d8..1981a02b71 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3162,10 +3162,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 8; for (int r = 0; r < 2; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << (i % 64); if (preds[r][i / 64] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3240,10 +3241,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 2; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (preds[r][i / 8] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3266,10 +3268,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 4; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (preds[r][i / 8] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3337,10 +3340,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 16; for (int r = 0; r < 2; r++) { + const uint16_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 32) * 2); if (preds[r][i / 32] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3777,10 +3781,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 32; for (int r = 0; r < 2; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (preds[r][i / 16] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3803,10 +3808,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 32; for (int r = 0; r < 4; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (preds[r][i / 16] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } From 246d39ab47748e81262955290a319a17027ad307 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 20 Dec 2024 11:06:10 +0000 Subject: [PATCH 70/71] CI CD fixes. 
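
The first hunk below touches the predicate-as-counter decode, so a compact reference for that encoding may help review: bit 15 is the invert flag, and bits 14:0 hold the element count scaled by the element size (a left shift of log2(bytes) + 1), as in the helper earlier in the series. The sketch assumes that layout; decodePredAsCounter and the bool mask are illustrative, not SimEng types.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Decode one vector's worth of lanes from a predicate-as-counter value.
    // Without invert, the first `count` lanes are active; with invert, the
    // first `count` lanes are inactive and the rest are active.
    std::vector<bool> decodePredAsCounter(uint16_t predAsCounter, unsigned elems,
                                          unsigned elemBytes) {
      const bool invert = (predAsCounter & 0x8000u) != 0;
      const unsigned shift =
          static_cast<unsigned>(std::log2f(static_cast<float>(elemBytes)) + 1);
      const unsigned count = (predAsCounter & 0x7FFFu) >> shift;
      std::vector<bool> active(elems);
      for (unsigned i = 0; i < elems; i++) {
        active[i] = (i < count) != invert;
      }
      return active;
    }

    int main() {
      // 32-bit elements (shift = 3), two active lanes, invert clear: value = 2 << 3.
      for (bool a : decodePredAsCounter(2u << 3, 4, 4)) std::printf("%d ", a ? 1 : 0);
      std::printf("\n");  // 1 1 0 0
      return 0;
    }
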
--- src/include/simeng/arch/aarch64/Instruction.hh | 4 ++-- src/include/simeng/arch/aarch64/helpers/sve.hh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index b1ffb97575..6cbc0c2908 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -300,14 +300,14 @@ std::vector> predAsCounterToMasks( static_cast(std::log2f(sizeof(T)) + 1); for (int r = 0; r < V; r++) { - for (int i = 0; i < elemsPerVec; i++) { + for (uint16_t i = 0; i < elemsPerVec; i++) { // Move bit to next position based on element type uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); // If invert = True (invert bit = 1), predElemCount dictates number of // initial inactive elements. // If invert = False (invert bit = 0), it indicates the number of initial // active elements. - if ((r * elemsPerVec) + i < predElemCount) { + if (static_cast(r * elemsPerVec) + i < predElemCount) { out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; } else { out[r][i / (64 / sizeof(T))] |= (invert) ? shifted_active : 0; diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index a42bd9680c..cf9ffd5683 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1636,7 +1636,7 @@ RegisterValue sveUdot( const N* zm = sourceValues[2].getAsVector(); D out[256 / sizeof(D)] = {0}; - for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { out[i] = zd[i]; for (int j = 0; j < W; j++) { out[i] += @@ -1666,7 +1666,7 @@ RegisterValue sveUdot_indexed( const int index = metadata.operands[2].vector_index; D out[256 / sizeof(D)] = {0}; - for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { D acc = zd[i]; // Index into zm selects which D-type element within each 128-bit vector // segment to use From 0ec0b8db24f62d28d7f131c4830a8677441f49dd Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 20 Dec 2024 11:49:48 +0000 Subject: [PATCH 71/71] CI CD fixes pt2. 
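
The loop-counter retyping below (uint64_t to uint16_t in sme.cc, int to uint64_t in sve.cc) reads as a fix for signed/unsigned comparison warnings in the CI compilers; that is an inference, not something the patch states. Under that assumption, a minimal illustration of the warning class and of the fix of matching the counter's type to the bound's type:

    #include <cstdint>

    void example(uint64_t vlBits) {
      // for (int i = 0; i < vlBits / 32; i++) {}   // -Wsign-compare: int vs uint64_t
      for (uint64_t i = 0; i < vlBits / 32; i++) {  // both operands unsigned: no warning
        (void)i;
      }
    }

    int main() {
      example(512);
      return 0;
    }
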
--- test/regression/aarch64/instructions/sme.cc | 46 ++++++++++----------- test/regression/aarch64/instructions/sve.cc | 4 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index d908d13a1d..75be221ae0 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -37,7 +37,7 @@ TEST_P(InstSme, add) { )"); const uint16_t zaStride = (SVL / 8) / 2; const uint16_t zaHalfIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, fillNeon({104}, (SVL / 8))); @@ -279,7 +279,7 @@ TEST_P(InstSme, fadd) { )"); const uint16_t zaStride = (SVL / 8) / 2; const uint16_t zaHalfIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, fillNeon({21.5f}, (SVL / 8))); @@ -330,7 +330,7 @@ TEST_P(InstSme, fadd) { fadd za.d[w8, #1, vgx2], {z4.d, z5.d} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, fillNeon({21.5}, (SVL / 8))); @@ -382,7 +382,7 @@ TEST_P(InstSme, fmla_multiVecs) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0f if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, @@ -441,7 +441,7 @@ TEST_P(InstSme, fmla_multiVecs) { fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, {z8.d - z11.d} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0 if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, @@ -501,7 +501,7 @@ TEST_P(InstSme, fmla_indexed_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0f if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, @@ -561,7 +561,7 @@ TEST_P(InstSme, fmla_indexed_vgx4) { fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, z10.d[0] )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0f if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, @@ -605,7 +605,7 @@ TEST_P(InstSme, fmopa) { fmopa za2.s, p0/m, p2/m, z3.s, z4.s )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, fillNeon({10.0f}, (SVL / 8))); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, @@ -633,7 +633,7 @@ TEST_P(InstSme, fmopa) { fmopa za2.d, p0/m, p2/m, z3.d, z4.d )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, fillNeon({10.0}, (SVL / 8))); CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, @@ -850,7 +850,7 @@ TEST_P(InstSme, st1d) { st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] st1d {za1h.d[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 8)), src[i % 2]); @@ -880,7 +880,7 @@ TEST_P(InstSme, st1d) { ld1d 
{za1h.d[w13, 1]}, p1/z, [x0, x3, lsl #3] st1d {za1h.d[w13, 1]}, p1, [x5, x3, lsl #3] )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { + for (uint16_t i = 0; i < (SVL / 128); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 8)), src[i % 2]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); } @@ -911,7 +911,7 @@ TEST_P(InstSme, st1d) { st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] st1d {za1v.d[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 8)), src_vert[i % 2]); @@ -941,7 +941,7 @@ TEST_P(InstSme, st1d) { ld1d {za1v.d[w13, 1]}, p1/z, [x0, x3, lsl #3] st1d {za1v.d[w13, 1]}, p1, [x5, x3, lsl #3] )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { + for (uint16_t i = 0; i < (SVL / 128); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 8)), src_vert[i % 2]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); } @@ -974,7 +974,7 @@ TEST_P(InstSme, st1w) { st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] st1w {za1h.s[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 4)), src[i % 4]); @@ -1003,7 +1003,7 @@ TEST_P(InstSme, st1w) { ld1w {za1h.s[w12, 2]}, p1/z, [x0, x3, lsl #2] st1w {za1h.s[w12, 2]}, p1, [x5, x3, lsl #2] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 4)), src[i % 4]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); } @@ -1035,7 +1035,7 @@ TEST_P(InstSme, st1w) { st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] st1w {za1v.s[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 4)), src_vert[i % 4]); @@ -1064,7 +1064,7 @@ TEST_P(InstSme, st1w) { ld1w {za1v.s[w12, 2]}, p1/z, [x0, x3, lsl #2] st1w {za1v.s[w12, 2]}, p1, [x5, x3, lsl #2] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 4)), src_vert[i % 4]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); } @@ -1109,7 +1109,7 @@ TEST_P(InstSme, udot_Indexed_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm values of {8, 9, 10, 11} if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, @@ -1173,7 +1173,7 @@ TEST_P(InstSme, udot_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, fillNeon({156, 316, 476, 636}, (SVL / 8))); @@ -1233,7 +1233,7 @@ TEST_P(InstSme, uvdot_indexed_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm values of {8, 9, 10, 11} if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, @@ -1279,7 +1279,7 @@ TEST_P(InstSme, umopa) { umopa za2.s, p0/m, p2/m, z3.b, z4.b )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, 
i, uint32_t, fillNeon({96}, (SVL / 8))); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, @@ -1302,7 +1302,7 @@ TEST_P(InstSme, zero) { zero {za} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint64_t, fillNeon({0}, SVL / 8)); } @@ -1339,7 +1339,7 @@ TEST_P(InstSme, zero) { zero {za0.s, za2.s} )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, fillNeon({0}, SVL / 8)); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f9699593f3..9411ef0085 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -2884,7 +2884,7 @@ TEST_P(InstSve, faddv) { )"); float s3 = 0.0f; float s4 = 0.0f; - for (int i = 0; i < VL / 32; i++) { + for (uint64_t i = 0; i < VL / 32; i++) { s3 += fsrc[i % (fsrc.size())]; if (i < (VL / 64)) s4 += fsrc[i % (fsrc.size())]; } @@ -2922,7 +2922,7 @@ TEST_P(InstSve, faddv) { )"); double d3 = 0.0; double d4 = 0.0; - for (int i = 0; i < VL / 64; i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { d3 += dsrc[i % (dsrc.size())]; if (i < (VL / 128)) d4 += dsrc[i % (dsrc.size())]; }
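
For reference, the faddv expectations at the end of this diff are built with an in-order predicated sum over the source elements; the same model in isolation (invented names and a plain bool mask), mirroring how the regression test computes its expected values rather than any particular hardware reduction order:

    #include <cstdio>
    #include <vector>

    // Predicated horizontal add: accumulate active lanes in increasing
    // element order, starting from zero.
    float faddvReference(const std::vector<float>& src, const std::vector<bool>& pred) {
      float sum = 0.0f;
      for (size_t i = 0; i < src.size(); i++) {
        if (pred[i]) sum += src[i];
      }
      return sum;
    }

    int main() {
      const std::vector<float> src = {1.0f, 2.5f, -0.5f, 4.0f};
      const std::vector<bool> full = {true, true, true, true};
      const std::vector<bool> lowerHalf = {true, true, false, false};
      std::printf("%f %f\n", faddvReference(src, full),       // 7.000000
                  faddvReference(src, lowerHalf));            // 3.500000
      return 0;
    }
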