UoB-HPC · FinnWilkinson · Oct 7, 2024 · Oct 7, 2024 · Oct 8, 2024 · Oct 9, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -116,6 +116,7 @@ option(SIMENG_SANITIZE "Enable compiler sanitizers" OFF)
 option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizations" OFF)
 option(SIMENG_ENABLE_SST "Compile SimEng SST Wrapper" OFF)
 option(SIMENG_ENABLE_SST_TESTS "Enable testing for SST" OFF)
+option(SIMENG_ENABLE_BF16 "Enable __bf16 instruction execution logic" OFF)
 
 # Set CXX flag for Apple Mac so that `binary_function` and `unary_function` types that are used in SST can be recognised. 
 # They were deprecated in C++11 and removed in C++17, and Apple Clang v15 no longer supports these types without the following flag
@@ -155,10 +156,9 @@ if(SIMENG_ENABLE_TESTS)
 
       # Print message containing if the full test suite will run
     if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0")
-      message(STATUS "LLVM version does not support AArch64 extensions SME or SVE2. These test suites will be skipped.")
-    endif()
-    if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0")
-      message(STATUS "LLVM version does not support AArch64 extensions SME2. These test suites will be skipped.")
+      message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.")
+    elseif (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0")
+      message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. Related tests will fail.")
     endif()
 
   else()

diff --git a/src/include/simeng/arch/aarch64/ArchInfo.hh b/src/include/simeng/arch/aarch64/ArchInfo.hh
@@ -18,7 +18,8 @@ class ArchInfo : public simeng::arch::ArchInfo {
                            aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1,
                            aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0,
                            aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0,
-                           aarch64_sysreg::AARCH64_SYSREG_SVCR}),
+                           aarch64_sysreg::AARCH64_SYSREG_SVCR,
+                           aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}),
         zaSize_(config["Core"]["Streaming-Vector-Length"].as<uint16_t>() / 8) {
     // Generate the architecture-defined architectural register structure
     archRegStruct_ = {

diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh
@@ -283,6 +283,40 @@ enum class InsnType : uint32_t {
   isBranch = 1 << 14
 };
 
+/** Expand a predicate-as-counter value into per-vector predicate masks.
+ * T represents the element type (i.e. for pg.s, T = uint32_t).
+ * V represents the number of vectors the predicate-as-counter is being used
+ * for; one mask of four 64-bit words is returned per vector. */
+template <typename T, int V>
+std::vector<std::array<uint64_t, 4>> predAsCounterToMasks(
+    const uint64_t predAsCounter, const uint16_t VL_bits) {
+  // A 64-bit mask word holds this many elements (sizeof(T) bits each)
+  const uint64_t elemsPerWord = 64 / sizeof(T);
+  const uint16_t elemsPerVec = VL_bits / (sizeof(T) * 8);
+
+  // Bit 15 is the invert flag. The element count is what remains of bits
+  // [14:0] once the low log2(sizeof(T)) + 1 bits are shifted out.
+  const bool invert = (predAsCounter & 0b1000000000000000) != 0;
+  const uint8_t shift = static_cast<uint8_t>(std::log2f(sizeof(T)) + 1);
+  const uint64_t predElemCount =
+      (predAsCounter & static_cast<uint64_t>(0b0111111111111111)) >> shift;
+
+  std::vector<std::array<uint64_t, 4>> out(V, {0, 0, 0, 0});
+  for (int r = 0; r < V; r++) {
+    for (uint16_t i = 0; i < elemsPerVec; i++) {
+      // Elements are numbered consecutively across the V vectors. Those below
+      // the count are active when invert = 0 and inactive when invert = 1;
+      // every other element takes the opposite state.
+      const bool belowCount =
+          (static_cast<uint64_t>(r * elemsPerVec) + i) < predElemCount;
+      if (belowCount != invert) {
+        out[r][i / elemsPerWord] |= 1ull << ((i % elemsPerWord) * sizeof(T));
+      }
+    }
+  }
+  return out;
+}
+
 /** A basic Armv9.2-a implementation of the `Instruction` interface. */
 class Instruction : public simeng::Instruction {
  public:

diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh
@@ -194,6 +194,23 @@ D fcvtzu_integer(srcValContainer& sourceValues) {
   return result;
 }
 
+/** Helper function for SCALAR/FP instructions with the format `ucvtf rd, rn,
+ * #fbits`. D represents the destination register type (e.g. for Sd, D =
+ * float). N represents the source register type (e.g. for Wn, N = uint32_t).
+ * Returns rn / 2^fbits as a single value of type D. */
+template <typename D, typename N>
+D ucvtf_fixedToFloat(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata) {
+  const N xn = sourceValues[0].get<N>();
+  const uint64_t fbits = static_cast<uint64_t>(metadata.operands[2].imm);
+  // Build 2^fbits from two half-width shifts: fbits may be 64 for Xn sources
+  // and `1ull << 64` is undefined. Powers of two keep the product exact.
+  const D scale = static_cast<D>(1ull << (fbits / 2)) *
+                  static_cast<D>(1ull << (fbits - (fbits / 2)));
+  return static_cast<D>(xn) / scale;
+}
+
 }  // namespace aarch64
 }  // namespace arch
 }  // namespace simeng
diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh
@@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) {
   const T* n = sourceValues[0].getAsVector<T>();
   const T* m = sourceValues[1].getAsVector<T>();
 
+  // Concatenate the vectors (n in the lower half, m in the upper half)
+  T temp[2 * I];
+  memcpy(temp, n, sizeof(T) * I);
+  memcpy(temp + I, m, sizeof(T) * I);  // temp is a T*, so offset in elements
+  // Compare each adjacent pair of elements
   T out[I];
   for (int i = 0; i < I; i++) {
-    out[i] = std::max(n[i], m[i]);
+    out[i] = std::max(temp[2 * i], temp[2 * i + 1]);
   }
   return {out, 256};
 }
@@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) {
   const T* n = sourceValues[0].getAsVector<T>();
   const T* m = sourceValues[1].getAsVector<T>();
 
+  // Concatenate the vectors (n in the lower half, m in the upper half)
+  T temp[2 * I];
+  memcpy(temp, n, sizeof(T) * I);
+  memcpy(temp + I, m, sizeof(T) * I);  // temp is a T*, so offset in elements
+  // Compare each adjacent pair of elements
   T out[I];
   for (int i = 0; i < I; i++) {
-    out[i] = std::min(n[i], m[i]);
+    out[i] = std::min(temp[2 * i], temp[2 * i + 1]);
   }
   return {out, 256};
 }
@@ -941,6 +951,63 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) {
   return {out, 256};
 }
 
+/** Helper function for NEON instructions with the format `udot vd.s, vn.b,
+ * vm.b`. D represents the number of elements in the output vector to be updated
+ * (i.e. for vd.2s D = 2). Only 2 or 4 are valid. `metadata` is not read.
+ * Returns correctly formatted RegisterValue. */
+template <int D>
+RegisterValue vecUdot(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata) {
+  // Check D is a valid value
+  static_assert(D == 2 || D == 4,
+                "D must be either 2 or 4 to align with vd.2s or vd.4s.");
+  const uint32_t* vd = sourceValues[0].getAsVector<uint32_t>();
+  const uint8_t* vn = sourceValues[1].getAsVector<uint8_t>();
+  const uint8_t* vm = sourceValues[2].getAsVector<uint8_t>();
+
+  uint32_t out[D] = {0};
+  for (int i = 0; i < D; i++) {
+    // Accumulate the 4-way byte dot product onto the existing value of vd[i]
+    out[i] = vd[i];
+    for (int j = 0; j < 4; j++) {
+      out[i] += (static_cast<uint32_t>(vn[(4 * i) + j]) *
+                 static_cast<uint32_t>(vm[(4 * i) + j]));
+    }
+  }
+  return {out, 256};
+}
+
+/** Helper function for NEON instructions with the format `udot vd.s, vn.b,
+ * vm.4b[index]`.
+ * D represents the number of elements in the output vector to be updated (i.e.
+ * for vd.2s D = 2). Only 2 or 4 are valid.
+ * Returns correctly formatted RegisterValue. */
+template <int D>
+RegisterValue vecUdot_byElement(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata) {
+  // Check D is a valid value
+  static_assert(D == 2 || D == 4,
+                "D must be either 2 or 4 to align with vd.2s or vd.4s.");
+  const uint32_t* vd = sourceValues[0].getAsVector<uint32_t>();
+  const uint8_t* vn = sourceValues[1].getAsVector<uint8_t>();
+  const uint8_t* vm = sourceValues[2].getAsVector<uint8_t>();
+  // `index` selects the group of 4 bytes of vm shared by every output element
+  const int index = metadata.operands[2].vector_index;
+
+  uint32_t out[D] = {0};
+  for (int i = 0; i < D; i++) {
+    uint32_t acc = vd[i];
+    for (int j = 0; j < 4; j++) {
+      acc += (static_cast<uint32_t>(vn[(4 * i) + j]) *
+              static_cast<uint32_t>(vm[(4 * index) + j]));
+    }
+    out[i] = acc;
+  }
+  return {out, 256};
+}
+
 /** Helper function for NEON instructions with the format `zip<1,2> vd.T,
  * vn.T, vm.T`.
  * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t).

diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh
@@ -626,6 +626,27 @@ std::enable_if_t<std::is_floating_point_v<T>, RegisterValue> sveFDivPredicated(
   return {out, 256};
 }
 
+/** Helper function for SVE instructions with the format `faddv rd, pg, zn`.
+ * D is both the element type of the source vector and the type of the scalar
+ * destination register (i.e. for zn.s and sd, D = float).
+ * Returns correctly formatted RegisterValue. */
+template <typename D>
+RegisterValue sveFaddv_predicated(srcValContainer& sourceValues,
+                                  const uint16_t VL_bits) {
+  const uint64_t* pg = sourceValues[0].getAsVector<uint64_t>();
+  const D* src = sourceValues[1].getAsVector<D>();
+
+  const uint64_t lanesPerPredWord = 64 / sizeof(D);
+  const uint16_t laneCount = VL_bits / (8 * sizeof(D));
+  D out[256 / sizeof(D)] = {0};
+  // Sum the active lanes, in ascending order, into element 0 of the result
+  for (int lane = 0; lane < laneCount; lane++) {
+    const uint64_t laneBit = 1ull << ((lane % lanesPerPredWord) * sizeof(D));
+    if ((pg[lane / lanesPerPredWord] & laneBit) != 0) out[0] += src[lane];
+  }
+  return {out, 256};
+}
+
 /** Helper function for SVE instructions with the format `fmad zd, pg/m, zn,
  * zm`.
  * T represents the type of sourceValues (e.g. for zn.d, T = double).
@@ -1319,6 +1340,40 @@ std::array<uint64_t, 4> svePtrue(
   return out;
 }
 
+/** Helper function for SVE instructions with the format `ptrue pnd`.
+ * T represents the type of sourceValues (e.g. for pnd.d, T = uint64_t).
+ * Returns an array of 4 uint64_t elements. */
+template <typename T>
+std::array<uint64_t, 4> svePtrue_counter(const uint16_t VL_bits) {
+  // A predicate-as-counter value occupies 16 bits, encoded as follows:
+  //    - Bit 15 is the invert bit
+  //            - invert = 0: the count is the number of active elements
+  //            - invert = 1: the count is the number of inactive elements
+  //    - The low bits, up to 4 of them, encode the element size (0b1, 0b10,
+  //    0b100, 0b1000 for b h s d respectively)
+  //    - The bits between the size field and bit 15 hold an unsigned count
+  //    of the consecutive elements, from element 0, that are active /
+  //    inactive
+  constexpr uint64_t invertBit = 0b1000000000000000;
+
+  // Each size encoding has the same numeric value as the element width in
+  // bytes. Any other width leaves the size field empty.
+  uint64_t sizeField = 0;
+  switch (sizeof(T)) {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+      sizeField = sizeof(T);
+      break;
+    default:
+      break;
+  }
+
+  // Invert bit set with a count of 0, i.e. the first 0 elements are FALSE
+  return {invertBit | sizeField, 0, 0, 0};
+}
+
 /** Helper function for SVE instructions with the format `punpk<hi,lo> pd.h,
  * pn.b`.
  * If `isHI` = false, then PUNPKLO is performed.
@@ -1563,6 +1618,69 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues,
   return {out, 256};
 }
 
+/** Helper function for SVE instructions with the format `udot zd, zn, zm`.
+ * D represents the element type of the destination register (i.e. for zd.s,
+ * D = uint32_t).
+ * N represents the source element type (i.e. for zn.b, N = uint8_t).
+ * W represents how many source elements are multiplied to form an output
+ * element (i.e. for 4-way, W = 4). `metadata` is not read.
+ * Returns correctly formatted RegisterValue. */
+template <typename D, typename N, int W>
+RegisterValue sveUdot(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata,
+    const uint16_t VL_bits) {
+  const D* zd = sourceValues[0].getAsVector<D>();
+  const N* zn = sourceValues[1].getAsVector<N>();
+  const N* zm = sourceValues[2].getAsVector<N>();
+
+  D out[256 / sizeof(D)] = {0};
+  for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) {
+    out[i] = zd[i];
+    for (int j = 0; j < W; j++) {
+      // Widen both operands to D so each product is formed at that width
+      out[i] +=
+          (static_cast<D>(zn[(W * i) + j]) * static_cast<D>(zm[(W * i) + j]));
+    }
+  }
+  return {out, 256};
+}
+
+/** Helper function for SVE instructions with the format `udot zd, zn,
+ * zm[index]`.
+ * D represents the element type of the destination register (i.e. for zd.s,
+ * D = uint32_t).
+ * N represents the element type of the source registers (i.e. for zn.b, N =
+ * uint8_t).
+ * W represents how many source elements are multiplied to form an output
+ * element (i.e. for 4-way, W = 4).
+ * Returns correctly formatted RegisterValue. */
+template <typename D, typename N, int W>
+RegisterValue sveUdot_indexed(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata,
+    const uint16_t VL_bits) {
+  const D* zd = sourceValues[0].getAsVector<D>();
+  const N* zn = sourceValues[1].getAsVector<N>();
+  const N* zm = sourceValues[2].getAsVector<N>();
+  const int index = metadata.operands[2].vector_index;
+
+  D out[256 / sizeof(D)] = {0};
+  for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) {
+    D acc = zd[i];
+    // `index` picks which D-sized group of zm is used within each 128-bit
+    // vector segment; both operands are widened to D before multiplying
+    int base = i - (i % (128 / (sizeof(D) * 8)));
+    int zmIndex = base + index;
+    for (int j = 0; j < W; j++) {
+      acc += (static_cast<D>(zn[(W * i) + j]) *
+              static_cast<D>(zm[(W * zmIndex) + j]));
+    }
+    out[i] = acc;
+  }
+  return {out, 256};
+}
+
 /** Helper function for SVE instructions with the format `<s,u>unpk>hi,lo> zd,
  * zn`.
  * D represents the type of the destination register (e.g. <u>int32_t for

diff --git a/src/include/simeng/arch/aarch64/operandContainer.hh b/src/include/simeng/arch/aarch64/operandContainer.hh
@@ -10,7 +10,7 @@ namespace arch {
 namespace aarch64 {
 
 /** The maximum number of source registers a non-SME instruction can have. */
-const uint8_t MAX_SOURCE_REGISTERS = 6;
+const uint8_t MAX_SOURCE_REGISTERS = 7;
 
 /** The maximum number of destination registers a non-SME instruction can have.
  */

diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in
@@ -9,5 +9,6 @@
 #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@
 #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}"
 #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}"
+#cmakedefine01 SIMENG_ENABLE_BF16
 
 #endif
diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc
@@ -626,8 +626,7 @@ bool ExceptionHandler::init() {
 
         break;
       }
-      case 293:  // rseq
-      {
+      case 293: {  // rseq
         stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}};
         break;
       }
@@ -818,7 +817,7 @@ void ExceptionHandler::readLinkAt(span<char> path) {
   for (size_t i = 0; i < bytesCopied; i += 256) {
     uint8_t size = std::min<uint64_t>(bytesCopied - i, 256ul);
     stateChange.memoryAddresses.push_back({bufAddress + i, size});
-    stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr, size));
+    stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr + i, size));
   }
 
   concludeSyscall(stateChange);

diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc
@@ -244,7 +244,7 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn)
   if (isAlias) {
     exceptionString_ =
         "This instruction is an alias. The printed mnemonic and operand string "
-        "differ from what is expected of the Capstone opcode.";
+        "may differ from the underlying opcode.";
   }
 }