UoB-HPC · FinnWilkinson · May 24, 2024 · May 28, 2024 · May 28, 2024 · May 28, 2024
diff --git a/configs/a64fx_SME.yaml b/configs/a64fx_SME.yaml
@@ -80,15 +80,15 @@ Ports:
     - INT_DIV_OR_SQRT
   5:
     Portname: EAGA
-    Instruction-Support: 
+    Instruction-Group-Support: 
     - LOAD
     - STORE_ADDRESS
     - INT_SIMPLE_ARTH_NOSHIFT
     - INT_SIMPLE_LOGICAL_NOSHIFT
     - INT_SIMPLE_CMP
   6:
     Portname: EAGB
-    Instruction-Support:
+    Instruction-Group-Support:
     - LOAD
     - STORE_ADDRESS
     - INT_SIMPLE_ARTH_NOSHIFT
@@ -98,6 +98,7 @@ Ports:
     Portname: BR
     Instruction-Group-Support:
     - BRANCH
+# Define example SME unit
   8:
     Portname: SME
     Instruction-Group-Support:

diff --git a/docs/sphinx/assets/instruction_groups.png b/docs/sphinx/assets/instruction_groups.png
diff --git a/src/include/simeng/Register.hh b/src/include/simeng/Register.hh
@@ -1,6 +1,5 @@
 #pragma once
 #include <cstdint>
-#include <iostream>
 
 namespace simeng {
 

diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh
@@ -70,6 +70,12 @@ class Architecture : public arch::Architecture {
   /** Returns the current value of SVCRval_. */
   uint64_t getSVCRval() const;
 
+  /** Returns whether SVE Streaming Mode is enabled. */
+  bool isStreamingModeEnabled() const;
+
+  /** Returns whether the SME ZA Register is enabled. */
+  bool isZARegisterEnabled() const;
+
   /** Update the value of SVCRval_. */
   void setSVCRval(const uint64_t newVal) const;
 

diff --git a/src/include/simeng/arch/aarch64/InstructionGroups.hh b/src/include/simeng/arch/aarch64/InstructionGroups.hh
@@ -4,7 +4,33 @@ namespace simeng {
 namespace arch {
 namespace aarch64 {
 
-/** The IDs of the instruction groups for AArch64 instructions. */
+/** The IDs of the instruction groups for AArch64 instructions.
+ * Each new group must contain 14 entries to ensure correct group assignment and
+ * general functionality.
+ * Their order must be as follows:
+ *  - BASE
+ *  - BASE_SIMPLE
+ *  - BASE_SIMPLE_ARTH
+ *  - BASE_SIMPLE_ARTH_NOSHIFT
+ *  - BASE_SIMPLE_LOGICAL
+ *  - BASE_SIMPLE_LOGICAL_NOSHIFT
+ *  - BASE_SIMPLE_CMP
+ *  - BASE_SIMPLE_CVT
+ *  - BASE_MUL
+ *  - BASE_DIV_OR_SQRT
+ *  - LOAD_BASE
+ *  - STORE_ADDRESS_BASE
+ *  - STORE_DATA_BASE
+ *  - STORE_BASE
+ *
+ * An exception to the above is "Parent" groups which do not require the LOAD_*
+ * or STORE_* groups.
+ * "Parent" groups allow for easier grouping of similar groups that may have
+ * identical execution latencies, ports, etc. For example, FP is the parent
+ * group of SCALAR and VECTOR.
+ * In simulation, an instruction's allocated group will never be a "Parent"
+ * group; they are only used to simplify config file creation and management.
+ */
 namespace InstructionGroups {
 const uint16_t INT = 0;
 const uint16_t INT_SIMPLE = 1;
@@ -102,7 +128,7 @@ static constexpr uint8_t NUM_GROUPS = 88;
 const std::unordered_map<uint16_t, std::vector<uint16_t>> groupInheritance_ = {
     {InstructionGroups::ALL,
      {InstructionGroups::INT, InstructionGroups::FP, InstructionGroups::SVE,
-      InstructionGroups::PREDICATE, InstructionGroups::SME,
+      InstructionGroups::SME, InstructionGroups::PREDICATE,
       InstructionGroups::LOAD, InstructionGroups::STORE,
       InstructionGroups::BRANCH}},
     {InstructionGroups::INT,

diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh
@@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) {
   const T* n = sourceValues[0].getAsVector<T>();
   const T* m = sourceValues[1].getAsVector<T>();
 
+  // Concatenate the vectors as {n, m}; offsets into `temp` are in elements
+  T temp[2 * I];
+  memcpy(temp, n, sizeof(T) * I);
+  memcpy(temp + I, m, sizeof(T) * I);
+  // Compare each adjacent pair of elements of the concatenated vector
   T out[I];
   for (int i = 0; i < I; i++) {
-    out[i] = std::max(n[i], m[i]);
+    out[i] = std::max(temp[2 * i], temp[2 * i + 1]);
   }
   return {out, 256};
 }
@@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) {
   const T* n = sourceValues[0].getAsVector<T>();
   const T* m = sourceValues[1].getAsVector<T>();
 
+  // Concatenate the vectors as {n, m}; offsets into `temp` are in elements
+  T temp[2 * I];
+  memcpy(temp, n, sizeof(T) * I);
+  memcpy(temp + I, m, sizeof(T) * I);
+
   T out[I];
   for (int i = 0; i < I; i++) {
-    out[i] = std::min(n[i], m[i]);
+    out[i] = std::min(temp[2 * i], temp[2 * i + 1]);
   }
   return {out, 256};
 }

diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc
@@ -284,6 +284,12 @@ void Architecture::setSVCRval(const uint64_t newVal) const {
   SVCRval_ = newVal;
 }
 
+// Bit 0 of the SVCR register indicates whether streaming mode is enabled.
+bool Architecture::isStreamingModeEnabled() const { return SVCRval_ & 1; }
+
+// Bit 1 of the SVCR register indicates whether the ZA register is enabled.
+bool Architecture::isZARegisterEnabled() const { return SVCRval_ & 2; }
+
 }  // namespace aarch64
 }  // namespace arch
 }  // namespace simeng
diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc
@@ -232,6 +232,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn)
       operands[2].access = CS_AC_READ;
       operands[3].access = CS_AC_READ;
       break;
+
+    case Opcode::AArch64_INSERT_MXIPZ_H_B:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_H_D:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_H_H:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_H_Q:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_H_S:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_V_B:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_V_D:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_V_H:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_V_Q:
+      [[fallthrough]];
+    case Opcode::AArch64_INSERT_MXIPZ_V_S:
+      // Need to add access specifiers.
+      // Although operands[0] should be READ | WRITE, due to the implemented
+      // decode logic for SME tile destinations, the register will be added as
+      // both source and destination with just WRITE access.
+      operands[0].access = CS_AC_WRITE;
+      operands[1].access = CS_AC_READ;
+      operands[2].access = CS_AC_READ;
+      break;
+    case Opcode::AArch64_LDR_ZA:
+      // Need to add access specifier.
+      // Although operands[0] should be READ | WRITE, due to the implemented
+      // decode logic for SME tile destinations, the register will be added as
+      // both source and destination with just WRITE access.
+      operands[0].access = CS_AC_WRITE;
+      break;
     case Opcode::AArch64_ZERO_M: {
       // Incorrect access type: All are READ but should all be WRITE
       for (int i = 0; i < operandCount; i++) {