diff --git a/configs/a64fx_SME.yaml b/configs/a64fx_SME.yaml index 7fe7086d5e..4d1090ae82 100644 --- a/configs/a64fx_SME.yaml +++ b/configs/a64fx_SME.yaml @@ -80,7 +80,7 @@ Ports: - INT_DIV_OR_SQRT 5: Portname: EAGA - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -88,7 +88,7 @@ Ports: - INT_SIMPLE_CMP 6: Portname: EAGB - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -98,6 +98,7 @@ Ports: Portname: BR Instruction-Group-Support: - BRANCH +# Define example SME unit 8: Portname: SME Instruction-Group-Support: diff --git a/docs/sphinx/assets/instruction_groups.png b/docs/sphinx/assets/instruction_groups.png deleted file mode 100644 index bf5bf5c73a..0000000000 Binary files a/docs/sphinx/assets/instruction_groups.png and /dev/null differ diff --git a/src/include/simeng/Register.hh b/src/include/simeng/Register.hh index 5758d8e67b..0152813268 100644 --- a/src/include/simeng/Register.hh +++ b/src/include/simeng/Register.hh @@ -1,6 +1,5 @@ #pragma once #include -#include namespace simeng { diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh index 8d4939c991..a654fc897a 100644 --- a/src/include/simeng/arch/aarch64/Architecture.hh +++ b/src/include/simeng/arch/aarch64/Architecture.hh @@ -70,6 +70,12 @@ class Architecture : public arch::Architecture { /** Returns the current value of SVCRval_. */ uint64_t getSVCRval() const; + /** Returns if SVE Streaming Mode is enabled. */ + bool isStreamingModeEnabled() const; + + /** Returns if the SME ZA Register is enabled. */ + bool isZARegisterEnabled() const; + /** Update the value of SVCRval_. */ void setSVCRval(const uint64_t newVal) const; diff --git a/src/include/simeng/arch/aarch64/InstructionGroups.hh b/src/include/simeng/arch/aarch64/InstructionGroups.hh index b50005571c..fc15e95230 100644 --- a/src/include/simeng/arch/aarch64/InstructionGroups.hh +++ b/src/include/simeng/arch/aarch64/InstructionGroups.hh @@ -4,7 +4,33 @@ namespace simeng { namespace arch { namespace aarch64 { -/** The IDs of the instruction groups for AArch64 instructions. */ +/** The IDs of the instruction groups for AArch64 instructions. + * Each new group must contain 14 entries to ensure correct group assignment and + * general functionality. + * Their order must be as follows: + * - BASE + * - BASE_SIMPLE + * - BASE_SIMPLE_ARTH + * - BASE_SIMPLE_ARTH_NOSHIFT + * - BASE_SIMPLE_LOGICAL + * - BASE_SIMPLE_LOGICAL_NOSHIFT + * - BASE_SIMPLE_CMP + * - BASE_SIMPLE_CVT + * - BASE_MUL + * - BASE_DIV_OR_SQRT + * - LOAD_BASE + * - STORE_ADDRESS_BASE + * - STORE_DATA_BASE + * - STORE_BASE + * + * An exception to the above is "Parent" groups which do not require the LOAD_* + * or STORE_* groups. + * "Parent" groups allow for easier grouping of similar groups that may have + * identical execution latencies, ports, etc. For example, FP is the parent + * group of SCALAR and VECTOR. + * In simulation, an instruction's allocated group will never be a "Parent" + * group; they are only used to simplify config file creation and management. 
+ */ namespace InstructionGroups { const uint16_t INT = 0; const uint16_t INT_SIMPLE = 1; @@ -102,7 +128,7 @@ static constexpr uint8_t NUM_GROUPS = 88; const std::unordered_map> groupInheritance_ = { {InstructionGroups::ALL, {InstructionGroups::INT, InstructionGroups::FP, InstructionGroups::SVE, - InstructionGroups::PREDICATE, InstructionGroups::SME, + InstructionGroups::SME, InstructionGroups::PREDICATE, InstructionGroups::LOAD, InstructionGroups::STORE, InstructionGroups::BRANCH}}, {InstructionGroups::INT, diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..cc9aa03461 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); + // Compare each adjacent pair of elements T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::max(n[i], m[i]); + out[i] = std::max(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } @@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); + T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::min(n[i], m[i]); + out[i] = std::min(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc index 015dba62b1..3ff09c5b5c 100644 --- a/src/lib/arch/aarch64/Architecture.cc +++ b/src/lib/arch/aarch64/Architecture.cc @@ -284,6 +284,12 @@ void Architecture::setSVCRval(const uint64_t newVal) const { SVCRval_ = newVal; } +// 0th bit of SVCR register determines if streaming-mode is enabled. +bool Architecture::isStreamingModeEnabled() const { return SVCRval_ & 1; } + +// 1st bit of SVCR register determines if ZA register is enabled. 
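// A minimal usage sketch, mirroring how Instruction_execute.cc in this patch
// consumes the two new helpers (names taken from this diff):
//   const bool SMenabled = architecture_.isStreamingModeEnabled();  // SVCR.SM, bit 0
//   const bool ZAenabled = architecture_.isZARegisterEnabled();     // SVCR.ZA, bit 1
// SME instructions can then bail out early via SMdisabled()/ZAdisabled() when
// the required bit is clear.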
+bool Architecture::isZARegisterEnabled() const { return SVCRval_ & 2; } + } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..56e438a3d8 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -232,6 +232,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[2].access = CS_AC_READ; operands[3].access = CS_AC_READ; break; + + case Opcode::AArch64_INSERT_MXIPZ_H_B: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_D: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_H: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_Q: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_S: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_B: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_D: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_H: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_Q: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_S: + // Need to add access specifiers + // although operands[0] should be READ | WRITE, due to the implemented + // decode logic for SME tile destinations, the register will be added as + // both source and destination with just WRITE access. + operands[0].access = CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + break; + case Opcode::AArch64_LDR_ZA: + // Need to add access specifier + // although operands[0] should be READ | WRITE, due to the implemented + // decode logic for SME tile destinations, the register will be added as + // both source and destination with just WRITE access. + operands[0].access = CS_AC_WRITE; + break; case Opcode::AArch64_ZERO_M: { // Incorrect access type: All are READ but should all be WRITE for (int i = 0; i < operandCount; i++) { diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..ec4f269a8f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -91,8 +91,25 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[2].get(), 8}}); break; } - case Opcode::AArch64_LD1_MXIPXX_V_D: // ld1d {zatv.d[ws, #imm]}, pg/z, - // [{, xm, lsl #3}] + case Opcode::AArch64_LD1_MXIPXX_V_B: // ld1b {zatv.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_B: { // ld1b {zath.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME + const uint16_t partition_num = VL_bits / 8; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get(); + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_D: // ld1d {zatv.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME + [[fallthrough]]; case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME @@ -104,8 +121,40 @@ span Instruction::generateAddresses() { setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); break; } - case Opcode::AArch64_LD1_MXIPXX_V_S: // ld1w {zatv.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_LD1_MXIPXX_V_H: // ld1h {zatv.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_H: { // ld1h {zath.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME + const 
uint16_t partition_num = VL_bits / 16; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 1; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_Q: // ld1q {zatv.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_Q: { // ld1q {zath.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME + const uint16_t partition_num = VL_bits / 128; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 4; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_S: // ld1w {zatv.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME + [[fallthrough]]; case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME @@ -459,6 +508,17 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } + case Opcode::AArch64_LDR_ZA: { // ldr za[wv, #imm], [{, #imm, mul + // vl}] + // SME + // ZA Row count === current VL in bytes + const uint16_t zaRowCount = VL_bits / 8; + const uint64_t xn = sourceValues_[zaRowCount + 1].get(); + const uint64_t imm = + static_cast(metadata_.operands[1].mem.disp); + setMemoryAddresses({xn + (imm * zaRowCount), zaRowCount}); + break; + } case Opcode::AArch64_LDRBBpost: { // ldrb wt, [xn], #imm setMemoryAddresses({{sourceValues_[0].get(), 1}}); break; @@ -501,19 +561,32 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get() + offset, 8}}); break; } - case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] - case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! - case Opcode::AArch64_LDRDui: // ldr dt, [xn, #imm] - case Opcode::AArch64_LDRDpre: // ldr dt, [xn, #imm]! - case Opcode::AArch64_LDRHui: // ldr ht, [xn, #imm] - case Opcode::AArch64_LDRHpre: // ldr ht, [xn, #imm]! - case Opcode::AArch64_LDRQui: // ldr qt, [xn, #imm] - case Opcode::AArch64_LDRQpre: // ldr qt, [xn, #imm]! - case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] - case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! - case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] - case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! - case Opcode::AArch64_LDRXui: // ldr xt, [xn, #imm] + case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRDui: // ldr dt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRDpre: // ldr dt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRHui: // ldr ht, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRHpre: // ldr ht, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRQui: // ldr qt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRQpre: // ldr qt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRXui: // ldr xt, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_LDRXpre: { // ldr xt, [xn, #imm]! 
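// Worked example for the SME ld1{b,h,w,d,q} / ldr za slice address generation
// above, assuming VL_bits (the streaming vector length, SVL) is 512:
//   - partition_num = SVL / element width in bits, e.g. 512 / 16 = 32 for ld1h;
//   - the optional index register xm is scaled by the element size
//     (<<0 for .b, <<1 for .h, <<2 for .s, <<3 for .d, <<4 for .q);
//   - every slice access covers SVL / 8 = 64 bytes starting at xn + xm, and
//     ldr za covers one full ZA row (also SVL / 8 bytes), with its immediate
//     scaled by that row size.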
std::vector addresses; generateContiguousAddresses( @@ -522,12 +595,18 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm - case Opcode::AArch64_LDRDpost: // ldr dt, [xn], #imm - case Opcode::AArch64_LDRHpost: // ldr ht, [xn], #imm - case Opcode::AArch64_LDRQpost: // ldr qt, [xn], #imm - case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm - case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm + case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRDpost: // ldr dt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRHpost: // ldr ht, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRQpost: // ldr qt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_LDRXpost: { // ldr xt, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[0].get(), 1, @@ -645,15 +724,24 @@ span Instruction::generateAddresses() { setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } - case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] - case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm!] - case Opcode::AArch64_LDPQi: // ldp qt1, qt2, [xn, #imm] - case Opcode::AArch64_LDPQpre: // ldp qt1, qt2, [xn, #imm!] - case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] - case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm!] - case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] - case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm!] - case Opcode::AArch64_LDPXi: // ldp xt1, xt2, [xn, #imm] + case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPQi: // ldp qt1, qt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPQpre: // ldp qt1, qt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPXi: // ldp xt1, xt2, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_LDPXpre: { // ldp xt1, xt2, [xn, #imm!] 
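// Note on the predicate layout assumed by the predicated SME handlers below:
// the predicate registers are read as packed uint64_t words with one bit per
// byte of the vector, so an element of width W bytes is governed by bit
// (elem * W) of the packed predicate. This is why the handlers test
//   1ull << (elem % 64)         for   8-bit elements,
//   1ull << ((elem % 32) * 2)   for  16-bit elements,
//   1ull << ((elem % 16) * 4)   for  32-bit elements,
//   1ull << ((elem % 8) * 8)    for  64-bit elements, and
//   1ull << ((elem % 4) * 16)   for 128-bit elements,
// against pg[elem / (64 / W)].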
std::vector addresses; generateContiguousAddresses( @@ -662,10 +750,14 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm - case Opcode::AArch64_LDPQpost: // ldp qt1, qt2, [xn], #imm - case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm - case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm + case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPQpost: // ldp qt1, qt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_LDPXpost: { // ldp xt1, xt2, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[0].get(), 2, @@ -958,8 +1050,33 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, - // [{, xm, lsl #3}] + case Opcode::AArch64_ST1_MXIPXX_H_B: // st1b {zath.b[ws, #imm]}, pg, + // [{, xm}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_B: { // st1b {zatv.b[ws, #imm]}, pg, + // [{, xm}] + // SME + const uint16_t partition_num = VL_bits / 8; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get(); + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 1, 1, + pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME + [[fallthrough]]; case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, // [{, xm, lsl #3}] // SME @@ -979,8 +1096,56 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_ST1_MXIPXX_H_S: // st1w {zath.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_ST1_MXIPXX_H_H: // st1h {zath.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_H: { // st1h {zatv.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME + const uint16_t partition_num = VL_bits / 16; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 1; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 2, 2, + pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_Q: // st1q {zath.q[ws]}, pg, [{, + // xm, lsl #4}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_Q: { // st1q {zatv.q[ws]}, pg, + // [{, xm, lsl #4}] + // SME + const uint16_t partition_num = VL_bits / 128; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 4; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), 
partition_num, 16, + 16, pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_S: // st1w {zath.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME + [[fallthrough]]; case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME @@ -1358,15 +1523,24 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } - case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] - case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! - case Opcode::AArch64_STPQi: // stp qt1, qt2, [xn, #imm] - case Opcode::AArch64_STPQpre: // stp qt1, qt2, [xn, #imm]! - case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] - case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! - case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] - case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! - case Opcode::AArch64_STPXi: // stp xt1, xt2, [xn, #imm] + case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPQi: // stp qt1, qt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPQpre: // stp qt1, qt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPXi: // stp xt1, xt2, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_STPXpre: { // stp xt1, xt2, [xn, #imm]! std::vector addresses; generateContiguousAddresses( @@ -1375,10 +1549,14 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm - case Opcode::AArch64_STPQpost: // stp qt1, qt2, [xn], #imm - case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm - case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm + case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPQpost: // stp qt1, qt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_STPXpost: { // stp xt1, xt2, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[2].get(), 2, @@ -1428,19 +1606,32 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get() + offset, 8}}); break; } - case Opcode::AArch64_STRBui: // str bt, [xn, #imm] - case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! - case Opcode::AArch64_STRDui: // str dt, [xn, #imm] - case Opcode::AArch64_STRDpre: // str dt, [xn, #imm]! - case Opcode::AArch64_STRHui: // str ht, [xn, #imm] - case Opcode::AArch64_STRHpre: // str ht, [xn, #imm]! - case Opcode::AArch64_STRQui: // str qt, [xn, #imm] - case Opcode::AArch64_STRQpre: // str qt, [xn, #imm]! - case Opcode::AArch64_STRSui: // str st, [xn, #imm] - case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! - case Opcode::AArch64_STRWui: // str wt, [xn, #imm] - case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! 
- case Opcode::AArch64_STRXui: // str xt, [xn, #imm] + case Opcode::AArch64_STRBui: // str bt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRDui: // str dt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRDpre: // str dt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRHui: // str ht, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRHpre: // str ht, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRQui: // str qt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRQpre: // str qt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRSui: // str st, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRWui: // str wt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRXui: // str xt, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_STRXpre: { // str xt, [xn, #imm]! std::vector addresses; generateContiguousAddresses( @@ -1449,12 +1640,18 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_STRBpost: // str bt, [xn], #imm - case Opcode::AArch64_STRDpost: // str dt, [xn], #imm - case Opcode::AArch64_STRHpost: // str ht, [xn], #imm - case Opcode::AArch64_STRQpost: // str qt, [xn], #imm - case Opcode::AArch64_STRSpost: // str st, [xn], #imm - case Opcode::AArch64_STRWpost: // str wt, [xn], #imm + case Opcode::AArch64_STRBpost: // str bt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRDpost: // str dt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRHpost: // str ht, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRQpost: // str qt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRSpost: // str st, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRWpost: // str wt, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_STRXpost: { // str xt, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[1].get(), 1, @@ -1545,6 +1742,16 @@ span Instruction::generateAddresses() { setMemoryAddresses({base + (offset * partition_num), partition_num}); break; } + case Opcode::AArch64_STR_ZA: { // str za[wv, #imm], [xn|sp{, #imm, mul + // vl}] + // SME + // ZA Row count === current VL in bytes + const uint16_t zaRowCount = VL_bits / 8; + const uint64_t xn = sourceValues_[zaRowCount + 1].get(); + const uint64_t imm = metadata_.operands[1].mem.disp; + setMemoryAddresses({{xn + (imm * zaRowCount), zaRowCount}}); + break; + } case Opcode::AArch64_STR_ZXI: { // str zt, [xn{, #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..3535ce590f 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -2,11 +2,6 @@ #include "InstructionMetadata.hh" -#define NOT(bits, length) (~bits & (1 << length - 1)) -#define CONCAT(hi, lo, lowLen) ((hi << lowLen) & lo) -#define ONES(n) ((1 << (n)) - 1) -#define ROR(x, shift, size) ((x >> shift) | (x << (size - shift))) - namespace simeng { namespace arch { namespace aarch64 { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..8f4bc38142 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -67,9 +67,9 @@ void 
Instruction::execute() { canExecute() && "Attempted to execute an instruction before all operands were provided"); // 0th bit of SVCR register determines if streaming-mode is enabled. - const bool SMenabled = architecture_.getSVCRval() & 1; + const bool SMenabled = architecture_.isStreamingModeEnabled(); // 1st bit of SVCR register determines if ZA register is enabled. - const bool ZAenabled = architecture_.getSVCRval() & 2; + const bool ZAenabled = architecture_.isZARegisterEnabled(); // When streaming mode is enabled, the architectural vector length goes from // SVE's VL to SME's SVL. const uint16_t VL_bits = SMenabled ? architecture_.getStreamingVectorLength() @@ -108,6 +108,148 @@ void Instruction::execute() { } } else { switch (metadata_.opcode) { + case Opcode::AArch64_ADDHA_MPPZ_D: { // addha zada.d, pn/m, pm/m, zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint64_t* zaRow = sourceValues_[row].getAsVector(); + uint64_t out[32] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint64_t)); + // Slice element is active IFF all of the following conditions hold: + // - Element in 1st source pred corresponding to horizontal + // slice is TRUE + // - Corresponding element in 2nd source pred is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 8) * 8); + if (pn[row / 8] & shifted_active_pn) { + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 8) * 8); + if (pm[elem / 8] & shifted_active_pm) { + out[elem] = zn[elem]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDHA_MPPZ_S: { // addha zada.s, pn/m, pm/m, zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint32_t* zaRow = sourceValues_[row].getAsVector(); + uint32_t out[64] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint32_t)); + // Slice element is active IFF all of the following conditions hold: + // - Element in 1st source pred corresponding to horizontal + // slice is TRUE + // - Corresponding element in 2nd source pred is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 16) * 4); + if (pn[row / 16] & shifted_active_pn) { + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 16) * 4); + if (pm[elem / 16] & shifted_active_pm) { + out[elem] = zn[elem]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDVA_MPPZ_D: { // addva zada.d, pn/m, pm/m, zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + 
sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint64_t* zaRow = sourceValues_[row].getAsVector(); + uint64_t out[32] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint64_t)); + // Slice element is active IFF all of the following conditions hold: + // - Corresponding element in 1st source pred is TRUE + // - Element in 2nd source pred corresponding to vertical + // slice is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 8) * 8); + if (pn[row / 8] & shifted_active_pn) { + // Corresponding slice element is active (i.e. all elements in row). + // Now check if each vertical slice (i.e. each row element) is + // active + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 8) * 8); + if (pm[elem / 8] & shifted_active_pm) { + out[elem] = zn[row]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDVA_MPPZ_S: { // addva zada.s, pn/m, pm/m, zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint32_t* zaRow = sourceValues_[row].getAsVector(); + uint32_t out[64] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint32_t)); + // Slice element is active IFF all of the following conditions hold: + // - Corresponding element in 1st source pred is TRUE + // - Element in 2nd source pred corresponding to vertical + // slice is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 16) * 4); + if (pn[row / 16] & shifted_active_pn) { + // Corresponding slice element is active (i.e. all elements in row). + // Now check if each vertical slice (i.e. 
each row element) is + // active in 2nd pred + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 16) * 4); + if (pm[elem / 16] & shifted_active_pm) { + out[elem] = zn[row]; + } + } + } + results_[row] = {out, 256}; + } + break; + } case Opcode::AArch64_ADCXr: { // adc xd, xn, xm auto [result, nzcv] = addCarry_3ops(sourceValues_); (void)nzcv; // Prevent unused variable warnings in GCC7 @@ -700,9 +842,9 @@ void Instruction::execute() { break; } case Opcode::AArch64_CMHSv16i8: { // cmhs vd.16b, vn.16b, vm.16b - results_[0] = vecCompare( + results_[0] = vecCompare( sourceValues_, false, - [](int8_t x, int8_t y) -> bool { return (x >= y); }); + [](uint8_t x, uint8_t y) -> bool { return (x >= y); }); break; } case Opcode::AArch64_CMPEQ_PPzZI_B: { // cmpeq pd.b, pg/z, zn.b, #imm @@ -1227,7 +1369,7 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); break; } - case Opcode::AArch64_EXTRACT_ZPMXI_H_B: { // MOVA zd.b, pg/m, zanh.b[ws, + case Opcode::AArch64_EXTRACT_ZPMXI_H_B: { // mova zd.b, pg/m, zanh.b[ws, // #imm] // SME // Check core is in correct context mode (check SM first) @@ -1237,23 +1379,288 @@ void Instruction::execute() { const uint16_t rowCount = VL_bits / 8; const uint8_t* zd = sourceValues_[0].getAsVector(); const uint64_t* pg = sourceValues_[1].getAsVector(); - const uint64_t sliceNum = + const uint32_t sliceNum = (sourceValues_[2 + rowCount].get() + static_cast( metadata_.operands[2].sme.slice_offset.imm)) % rowCount; - const uint8_t* zanRow = + const uint8_t* zaRow = sourceValues_[2 + sliceNum].getAsVector(); + uint8_t out[256] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_D: { // mova zd.d, pg/m, zanh.d[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint64_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint64_t out[32] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_H: { // mova zd.h, pg/m, zanh.h[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint16_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint16_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint16_t out[128] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = 
zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_Q: { // mova zd.q, pg/m, zanh.q[ws] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + // Use uint64_t as no 128-bit + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + sourceValues_[2 + rowCount].get() % rowCount; + // Use uint64_t as no 128-bit + const uint64_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + // Use uint64_t as no 128-bit + uint64_t out[32] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zaRow[2 * elem]; + out[2 * elem + 1] = zaRow[2 * elem + 1]; + } else { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zd[2 * elem]; + out[2 * elem + 1] = zd[2 * elem + 1]; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_S: { // mova zd.s, pg/m, zanh.s[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint32_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + uint32_t out[64] = {0}; for (int elem = 0; elem < rowCount; elem++) { - uint64_t shifted_active = 1ull << ((elem % 64)); + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_B: { // mova zd.b, pg/m, zanv.b[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint8_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint8_t out[256] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); if (pg[elem / 64] & shifted_active) - out[elem] = zanRow[elem]; + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_D: { // mova zd.d, pg/m, zanv.d[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + 
rowCount; + + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_H: { // mova zd.h, pg/m, zanv.h[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint16_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint16_t out[128] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; else out[elem] = zd[elem]; } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_Q: { // mova zd.q, pg/m, zanv.q[ws] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + // Use uint64_t as no 128-bit + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + sourceValues_[2 + rowCount].get() % rowCount; + + // Use uint64_t as no 128-bit + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + const uint64_t* zaRow = + sourceValues_[2 + elem].getAsVector(); + out[2 * elem] = zaRow[2 * sliceNum]; + out[2 * elem + 1] = zaRow[2 * sliceNum + 1]; + } else { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zd[2 * elem]; + out[2 * elem + 1] = zd[2 * elem + 1]; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_S: { // mova zd.s, pg/m, zanv.s[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + uint32_t out[64] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } results_[0] = {out, 256}; break; } @@ -1948,20 +2355,84 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_FMOVDXHighr: { // fmov xd, vn.d[1] - results_[0] = sourceValues_[0].getAsVector()[1]; - break; - } - case Opcode::AArch64_FMOVDXr: { // fmov xd, dn - results_[0] = sourceValues_[0].get(); - break; - } - case Opcode::AArch64_FMOVDi: { // fmov dn, #imm - results_[0] = {metadata_.operands[1].fp, 256}; - break; - } - case 
Opcode::AArch64_FMOVDr: { // fmov dd, dn - results_[0] = {sourceValues_[0].get(), 256}; + case Opcode::AArch64_FMOPS_MPPZZ_D: { // fmops zada.d, pn/m, pm/m, zn.d, + // zm.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const double* zn = sourceValues_[rowCount + 2].getAsVector(); + const double* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + double outRow[32] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 8) * 8); + const double* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + double zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 8) * 8); + if ((pm[col / 8] & shifted_active_col) && + (pn[row / 8] & shifted_active_row)) + outRow[col] = zadaElem - (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_FMOPS_MPPZZ_S: { // fmops zada.s, pn/m, pm/m, zn.s, + // zm.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const float* zn = sourceValues_[rowCount + 2].getAsVector(); + const float* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 16) * 4); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + float zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 16) * 4); + if ((pm[col / 16] & shifted_active_col) && + (pn[row / 16] & shifted_active_row)) + outRow[col] = zadaElem - (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_FMOVDXHighr: { // fmov xd, vn.d[1] + results_[0] = sourceValues_[0].getAsVector()[1]; + break; + } + case Opcode::AArch64_FMOVDXr: { // fmov xd, dn + results_[0] = sourceValues_[0].get(); + break; + } + case Opcode::AArch64_FMOVDi: { // fmov dn, #imm + results_[0] = {metadata_.operands[1].fp, 256}; + break; + } + case Opcode::AArch64_FMOVDr: { // fmov dd, dn + results_[0] = {sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FMOVSWr: { // fmov wd, sn @@ -2529,6 +3000,325 @@ void Instruction::execute() { VL_bits, false, false); break; } + + case Opcode::AArch64_INSERT_MXIPZ_H_B: { // mova zadh.b[ws, #imm], pg/m, + // zn.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint8_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint8_t* zn = sourceValues_[rowCount + 2].getAsVector(); + + uint8_t out[256] = {0}; + for (uint16_t elem = 0; elem < rowCount; 
elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_D: { // mova zadh.d[ws, #imm], pg/m, + // zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_H: { // mova zadh.h[ws, #imm], pg/m, + // zn.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint16_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint16_t out[128] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? 
RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_Q: { // mova zadh.q[ws], pg/m, zn.q + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + const uint32_t sliceNum = + sourceValues_[rowCount].get() % rowCount; + // Use uint64_t in place of 128-bit + const uint64_t* zaRow = sourceValues_[sliceNum].getAsVector(); + + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint64_t in place of 128-bit + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + // Use uint64_t in place of 128-bit + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[(2 * elem)] = zn[(2 * elem)]; + out[(2 * elem + 1)] = zn[(2 * elem + 1)]; + } else { + // Need to move two consecutive 64-bit elements + out[(2 * elem)] = zaRow[(2 * elem)]; + out[(2 * elem + 1)] = zaRow[(2 * elem + 1)]; + } + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_S: { // mova zadh.s[ws, #imm], pg/m, + // zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint32_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint32_t out[64] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? 
RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_B: { // mova zadv.b[ws, #imm], pg/m, + // zn.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint8_t* zn = sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint8_t* row = sourceValues_[i].getAsVector(); + uint8_t out[256] = {0}; + memcpy(out, row, rowCount * sizeof(uint8_t)); + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_D: { // mova zadv.d[ws, #imm], pg/m, + // zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + memcpy(out, row, rowCount * sizeof(uint64_t)); + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_H: { // mova zadv.h[ws, #imm], pg/m, + // zn.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint16_t* row = sourceValues_[i].getAsVector(); + uint16_t out[128] = {0}; + memcpy(out, row, rowCount * sizeof(uint16_t)); + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_Q: { // mova zadv.q[ws], pg/m, zn.q + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + const uint32_t sliceNum = + sourceValues_[rowCount].get() % rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint64_t in place of 128-bit + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + // Use uint64_t in place of 128-bit + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + // *2 in memcpy as need 128-bit elements but using uint64_t + memcpy(out, row, rowCount * sizeof(uint64_t) * 2); + // For 128-bit there are 16-bit for each active element + 
uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[2 * sliceNum] = zn[2 * i]; + out[2 * sliceNum + 1] = zn[2 * i + 1]; + } + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_S: { // mova zadv.s[ws, #imm], pg/m, + // zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint32_t* row = sourceValues_[i].getAsVector(); + uint32_t out[64] = {0}; + memcpy(out, row, rowCount * sizeof(uint32_t)); + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (pg[i / 16] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } case Opcode::AArch64_INSvi16gpr: { // ins vd.h[index], wn results_[0] = vecInsIndex_gpr(sourceValues_, metadata_); @@ -2557,25 +3347,91 @@ void Instruction::execute() { vecInsIndex_gpr(sourceValues_, metadata_); break; } + case Opcode::AArch64_LD1_MXIPXX_H_B: { // ld1b {zath.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint16_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint8_t* data = memoryData_[0].getAsVector(); + + uint8_t out[256] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD - // Not in right context mode. 
Raise exception + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint16_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint64_t* data = memoryData_[0].getAsVector(); + + uint64_t out[32] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_H: { // ld1h {zath.h[ws, #imm]}, pg/z, + // [{, xm, LSL #1}] + // SME, LOAD + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 16; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); - const uint16_t sliceNum = + const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - const uint64_t* data = memoryData_[0].getAsVector(); + const uint16_t* data = memoryData_[0].getAsVector(); - uint64_t out[32] = {0}; + uint16_t out[128] = {0}; for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (pg[i / 8] & shifted_active) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) { out[i] = data[i]; } else { out[i] = 0; @@ -2585,46 +3441,53 @@ void Instruction::execute() { // All Slice vectors are added to results[] so need to update the // correct one for (uint16_t i = 0; i < partition_num; i++) { - if (i == sliceNum) - results_[i] = {out, 256}; - else - // Maintain un-updated rows. - results_[i] = sourceValues_[i]; + results_[i] = sourceValues_[i]; } + results_[sliceNum] = {out, 256}; break; } - case Opcode::AArch64_LD1_MXIPXX_V_D: { // ld1d {zatv.d[ws, #imm]}, pg/z, - // [{, xm, lsl #3}] + case Opcode::AArch64_LD1_MXIPXX_H_Q: { // ld1q {zath.q[ws]}, pg/z, + // [{, xm, LSL #4}] // SME, LOAD - // Not in right context mode. 
Raise exception + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 128; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); - const uint32_t sliceNum = - (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint32_t sliceNum = ws % partition_num; + // Use uint64_t as no 128-bit type const uint64_t* data = memoryData_[0].getAsVector(); + // Use uint64_t as no 128-bit type + uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { - uint64_t* row = - const_cast(sourceValues_[i].getAsVector()); - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (pg[i / 8] & shifted_active) { - row[sliceNum] = data[i]; + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to modify 2 elements + out[2 * i] = data[2 * i]; + out[2 * i + 1] = data[2 * i + 1]; } else { - row[sliceNum] = 0; + out[2 * i] = 0; + out[2 * i + 1] = 0; } - results_[i] = RegisterValue(reinterpret_cast(row), 256); } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; break; } case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME, LOAD - // Not in right context mode. Raise exception + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); const uint16_t partition_num = VL_bits / 32; @@ -2648,19 +3511,129 @@ void Instruction::execute() { // All Slice vectors are added to results[] so need to update the // correct one - for (uint32_t i = 0; i < partition_num; i++) { - if (i == sliceNum) - results_[i] = {out, 256}; - else - // Maintain un-updated rows. 
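// A minimal standalone sketch (not part of the patch) of the merge pattern the
// ld1 horizontal tile-slice handlers around here now follow: build the loaded
// row under a zeroing predicate, forward every other tile row unchanged, and
// overwrite only the addressed slice. std::vector containers stand in for the
// simulator's RegisterValue rows; the predicate encoding (one bit per byte, so
// a 32-bit element tests bit ((i % 16) * 4) of pg[i / 16]) mirrors the code above.
#include <cstdint>
#include <vector>

std::vector<std::vector<uint32_t>> loadHorizontalSlice(
    std::vector<std::vector<uint32_t>> tile, uint16_t sliceNum,
    const std::vector<uint32_t>& data, const std::vector<uint64_t>& pg) {
  std::vector<uint32_t> row(tile[sliceNum].size(), 0);
  for (size_t i = 0; i < row.size(); i++) {
    const uint64_t active = 1ull << ((i % 16) * 4);
    if (pg[i / 16] & active) row[i] = data[i];  // inactive lanes stay zero
  }
  tile[sliceNum] = row;  // all other rows pass through untouched
  return tile;
}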
- results_[i] = sourceValues_[i]; + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_B: { // ld1b {zatv.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint8_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint8_t* row = sourceValues_[i].getAsVector(); + uint8_t out[256] = {0}; + memcpy(out, row, partition_num * sizeof(uint8_t)); + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_D: { // ld1d {zatv.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint64_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + memcpy(out, row, partition_num * sizeof(uint64_t)); + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_H: { // ld1h {zatv.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint16_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint16_t* row = sourceValues_[i].getAsVector(); + uint16_t out[128] = {0}; + memcpy(out, row, partition_num * sizeof(uint16_t)); + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_Q: { // ld1q {zatv.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + // Using uint64_t as no 128-bit data type + const uint64_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + // Using uint64_t as no 128-bit data type + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + // *2 in memcpy 
as need 128-bit but using uint64_t + memcpy(out, row, partition_num * sizeof(uint64_t) * 2); + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to modify 2 elements + out[2 * sliceNum] = data[2 * i]; + out[2 * sliceNum + 1] = data[2 * i + 1]; + } + results_[i] = RegisterValue(out, 256); } break; } case Opcode::AArch64_LD1_MXIPXX_V_S: { // ld1w {zatv.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME, LOAD - // Not in right context mode. Raise exception + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); const uint16_t partition_num = VL_bits / 32; @@ -2673,15 +3646,14 @@ void Instruction::execute() { const uint32_t* data = memoryData_[0].getAsVector(); for (int i = 0; i < partition_num; i++) { - uint32_t* row = - const_cast(sourceValues_[i].getAsVector()); + const uint32_t* row = sourceValues_[i].getAsVector(); + uint32_t out[64] = {0}; + memcpy(out, row, partition_num * sizeof(uint32_t)); uint64_t shifted_active = 1ull << ((i % 16) * 4); if (pg[i / 16] & shifted_active) { - row[sliceNum] = data[i]; - } else { - row[sliceNum] = 0; + out[sliceNum] = data[i]; } - results_[i] = RegisterValue(reinterpret_cast(row), 256); + results_[i] = RegisterValue(out, 256); } break; } @@ -3671,6 +4643,31 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LDR_ZA: { // ldr za[wv, #imm], [{, #imm, mul + // vl}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t wn = sourceValues_[rowCount].get(); + const uint32_t sliceNum = + wn + + static_cast(metadata_.operands[0].sme.slice_offset.imm); + + const uint8_t* data = memoryData_[0].getAsVector(); + uint8_t out[256] = {0}; + for (uint16_t i = 0; i < rowCount; i++) { + out[i] = data[i]; + } + + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = (row == sliceNum) + ? 
RegisterValue(out, 256) + : results_[row] = sourceValues_[row]; + } + break; + } case Opcode::AArch64_LDTRSBXi: { // ldtrsb xt, [xn, #imm] // LOAD // TODO: implement @@ -4334,6 +5331,158 @@ void Instruction::execute() { [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } + case Opcode::AArch64_SMOPA_MPPZZ_D: { // smopa zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPA_MPPZZ_S: { // smopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPS_MPPZZ_D: { // smops zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = 
sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPS_MPPZZ_S: { // smops zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_SMSUBLrrr: { // smsubl xd, wn, wm, xa results_[0] = msubl_4ops(sourceValues_); break; @@ -4410,40 +5559,195 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_SST1D_IMM: { // st1d {zd.d}, pg, [zn.d{, #imm}] - // STORE - const uint64_t* t = sourceValues_[0].getAsVector(); - const uint64_t* p = sourceValues_[1].getAsVector(); + case Opcode::AArch64_SST1D_IMM: { // st1d {zd.d}, pg, [zn.d{, #imm}] + // STORE + const uint64_t* t = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); + + const uint16_t partition_num = VL_bits / 64; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + memoryData_[index] = t[i]; + index++; + } + } + break; + } + case Opcode::AArch64_SST1D_SCALED: { // st1d {zt.d}, pg, [xn, + // zm.d, lsl #3] + // STORE + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); + + const uint16_t partition_num = VL_bits / 64; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + memoryData_[index] = d[i]; + index++; + } + } + break; + } + 
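// The smopa/smops handlers above (and the sumopa/umopa/usmopa family further
// on) all compute the same widening sum of outer products: ZA holds the
// accumulators, zn supplies four elements per row, zm four per column, and a
// lane only contributes when both predicates are active. A self-contained
// sketch of that core loop for the signed 8-bit to 32-bit case, using plain
// vectors and bools instead of the simulator's RegisterValue rows and
// bit-packed predicates (the 16-bit to 64-bit variants differ only in types):
#include <cstdint>
#include <vector>

void outerProductAcc(std::vector<std::vector<int32_t>>& zada,
                     const std::vector<int8_t>& zn,
                     const std::vector<int8_t>& zm,
                     const std::vector<bool>& pn,
                     const std::vector<bool>& pm, bool subtract) {
  const size_t dim = zada.size();
  for (size_t row = 0; row < dim; row++) {
    for (size_t col = 0; col < dim; col++) {
      int32_t acc = 0;
      for (int k = 0; k < 4; k++) {
        const size_t znIdx = 4 * row + k;
        const size_t zmIdx = 4 * col + k;
        if (pn[znIdx] && pm[zmIdx])
          acc += static_cast<int32_t>(zn[znIdx]) *
                 static_cast<int32_t>(zm[zmIdx]);
      }
      zada[row][col] += subtract ? -acc : acc;  // the *MOPS forms subtract
    }
  }
}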
case Opcode::AArch64_ST1_MXIPXX_H_B: { // st1b {zath.b[ws, #imm]}, pg, + // [{, xm}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint8_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_D: { // st1d {zath.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint64_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_H: { // st1h {zath.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint16_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_Q: { // st1q {zath.q[ws]}, pg, + // [{, xm, lsl #4}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + + // Using uint64_t as no 128-bit type + const uint64_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + + // Need to combine active adjacent elements into RegisterValues and + // place into each memoryData_ index. 
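// A rough standalone equivalent of the chunking performed below: active
// quadwords are appended to the current run, and each gap in the predicate
// flushes that run as one separate memory request. A bool per element and
// uint64_t pairs stand in here for the bit-packed predicate and the 128-bit
// elements used by the handler itself.
#include <cstdint>
#include <vector>

std::vector<std::vector<uint64_t>> chunkActiveQuads(
    const std::vector<uint64_t>& slice, const std::vector<bool>& active) {
  std::vector<std::vector<uint64_t>> requests;
  std::vector<uint64_t> run;
  for (size_t q = 0; q < active.size(); q++) {
    if (active[q]) {
      run.push_back(slice[2 * q]);      // low 64 bits of the quadword
      run.push_back(slice[2 * q + 1]);  // high 64 bits
    } else if (!run.empty()) {
      requests.push_back(run);  // a predicate gap closes the current run
      run.clear();
    }
  }
  if (!run.empty()) requests.push_back(run);  // flush any trailing run
  return requests;
}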
+ int index = 0; + std::vector memData; + for (uint16_t i = 0; i < partition_num; i++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to push_back 2 elements + memData.push_back(tileSlice[2 * i]); + memData.push_back(tileSlice[2 * i + 1]); + } else if (memData.size() > 0) { + // Predicate false, save current data + memoryData_[index] = RegisterValue( + (char*)memData.data(), memData.size() * sizeof(uint64_t)); + index++; + memData.clear(); + } + } + // Check if final data needs putting into memoryData_ + if (memData.size() > 0) { + memoryData_[index] = RegisterValue((char*)memData.data(), + memData.size() * sizeof(uint64_t)); + } + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_S: { // st1w {zath.s[ws, #imm]}, pg, + // [{, xm, lsl #2}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; - uint16_t index = 0; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - memoryData_[index] = t[i]; - index++; - } - } + const uint16_t partition_num = VL_bits / 32; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint32_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); break; } - case Opcode::AArch64_SST1D_SCALED: { // st1d {zt.d}, pg, [xn, - // zm.d, lsl #3] - // STORE - const uint64_t* d = sourceValues_[0].getAsVector(); - const uint64_t* p = sourceValues_[1].getAsVector(); + case Opcode::AArch64_ST1_MXIPXX_V_B: { // st1b {zatv.b[ws, #imm]}, pg, + // [{, xm}] + // SME, STORE + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + std::vector memData; uint16_t index = 0; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - memoryData_[index] = d[i]; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << (x % 64); + if (pg[x / 64] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size()); index++; + memData.clear(); } } + + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size()); + } break; } - case Opcode::AArch64_ST1_MXIPXX_H_D: { // st1d {zath.d[ws, #imm]}, pg, + case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, // [{, xm, lsl #3}] // SME, STORE // Not in right context mode. 
Raise exception @@ -4457,19 +5761,35 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - const uint64_t* tileSlice = - sourceValues_[sliceNum].getAsVector(); - memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + std::vector memData; + uint16_t index = 0; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << ((x % 8) * 8); + if (pg[x / 8] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 8); + index++; + memData.clear(); + } + } + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 8); + } break; } - case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, - // [{, xm, lsl #3}] + case Opcode::AArch64_ST1_MXIPXX_V_H: { // st1h {zatv.h[ws, #imm]}, pg, + // [{, xm, LSL #1}] // SME, STORE // Not in right context mode. Raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 16; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); @@ -4477,45 +5797,68 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - std::array mdata; - uint16_t md_size = 0; + std::vector memData; uint16_t index = 0; for (uint16_t x = 0; x < partition_num; x++) { - uint64_t shifted_active = 1ull << ((x % 8) * 8); - if (pg[x / 8] & shifted_active) { - mdata[md_size] = sourceValues_[x].getAsVector()[sliceNum]; - md_size++; - } else if (md_size) { + uint64_t shifted_active = 1ull << ((x % 32) * 2); + if (pg[x / 32] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { memoryData_[index] = - RegisterValue((char*)mdata.data(), md_size * 8); - md_size = 0; + RegisterValue((char*)memData.data(), memData.size() * 2); + index++; + memData.clear(); } } - if (md_size) { - memoryData_[index] = RegisterValue((char*)mdata.data(), md_size * 8); + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 2); } break; } - case Opcode::AArch64_ST1_MXIPXX_H_S: { // st1w {zath.s[ws, #imm]}, pg, - // [{, xm, LSL #2}] + case Opcode::AArch64_ST1_MXIPXX_V_Q: { // st1h {zatv.q[ws]}, pg, + // [{, xm, LSL #4}] // SME, STORE // Not in right context mode. Raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 32; + const uint16_t partition_num = VL_bits / 128; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); - const uint32_t sliceNum = - (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint32_t sliceNum = ws % partition_num; - const uint32_t* tileSlice = - sourceValues_[sliceNum].getAsVector(); - memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + // Need to combine active adjacent elements into RegisterValues and + // place into each memoryData_ index. 
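// The zatv (vertical) store forms differ from the zath ones only in how the
// slice is gathered: element sliceNum is read from every row of the tile
// instead of one whole row being used. A minimal column gather over row
// vectors, with predication omitted for brevity (illustrative only, not the
// simulator's API):
#include <cstdint>
#include <vector>

std::vector<uint64_t> gatherColumn(
    const std::vector<std::vector<uint64_t>>& tileRows, size_t slice) {
  std::vector<uint64_t> column;
  column.reserve(tileRows.size());
  for (const auto& row : tileRows) column.push_back(row[slice]);
  return column;
}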
+ std::vector memData; + uint16_t index = 0; + for (uint16_t x = 0; x < partition_num; x++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((x % 4) * 16); + if (pg[x / 4] & shifted_active) { + // As using uint64_t need to push_back 2 elements + memData.push_back( + sourceValues_[x].getAsVector()[2 * sliceNum]); + memData.push_back( + sourceValues_[x].getAsVector()[2 * sliceNum + 1]); + } else if (memData.size() > 0) { + // Predicate false, save current data + memoryData_[index] = RegisterValue( + (char*)memData.data(), memData.size() * sizeof(uint64_t)); + index++; + memData.clear(); + } + } + // Check if final data needs putting into memoryData_ + if (memData.size() > 0) { + memoryData_[index] = RegisterValue((char*)memData.data(), + memData.size() * sizeof(uint64_t)); + } break; } case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg, @@ -4532,26 +5875,26 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - std::array mdata; - uint16_t md_size = 0; + std::vector memData; uint16_t index = 0; for (uint16_t x = 0; x < partition_num; x++) { uint64_t shifted_active = 1ull << ((x % 16) * 4); if (pg[x / 16] & shifted_active) { - mdata[md_size] = sourceValues_[x].getAsVector()[sliceNum]; - md_size++; - } else if (md_size) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { memoryData_[index] = - RegisterValue((char*)mdata.data(), md_size * 4); - md_size = 0; + RegisterValue((char*)memData.data(), memData.size() * 4); + index++; + memData.clear(); } } - if (md_size) { - memoryData_[index] = RegisterValue((char*)mdata.data(), md_size * 4); + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 4); } - break; } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] @@ -5133,6 +6476,21 @@ void Instruction::execute() { memoryData_[0] = RegisterValue((char*)p, partition_num); break; } + case Opcode::AArch64_STR_ZA: { // str za[wv, #imm], [xn|sp{, #imm, mul + // vl}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint32_t wv = sourceValues_[zaRowCount].get(); + const uint32_t imm = metadata_.operands[0].sme.slice_offset.imm; + + const uint8_t* zaRow = + sourceValues_[(wv + imm) % zaRowCount].getAsVector(); + memoryData_[0] = RegisterValue((char*)zaRow, zaRowCount); + break; + } case Opcode::AArch64_STR_ZXI: { // str zt, [xn{, #imm, mul vl}] // STORE const uint16_t partition_num = VL_bits / 8; @@ -5316,6 +6674,158 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x - y; }); break; } + case Opcode::AArch64_SUMOPA_MPPZZ_D: { // sumopa zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const 
int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPA_MPPZZ_S: { // sumopa zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPS_MPPZZ_D: { // sumops zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPS_MPPZZ_S: { // sumops zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode 
(check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_SVC: { // svc #imm exceptionEncountered_ = true; exception_ = InstructionException::SupervisorCall; @@ -5568,6 +7078,158 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMOPA_MPPZZ_D: { // umopa zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + uint64_t outRow[32] = {0}; + const uint64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPA_MPPZZ_S: { // umopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) 
{ + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPS_MPPZZ_D: { // umops zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + uint64_t outRow[32] = {0}; + const uint64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPS_MPPZZ_S: { // umops zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, 
vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); @@ -5635,6 +7297,158 @@ void Instruction::execute() { sourceValues_, metadata_, false); break; } + case Opcode::AArch64_USMOPA_MPPZZ_D: { // usmopa zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPA_MPPZZ_S: { // usmopa zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPS_MPPZZ_D: { // usmops zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x 
SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPS_MPPZZ_S: { // usmops zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_UUNPKHI_ZZ_D: { // uunpkhi zd.d, zn.s results_[0] = sveUnpk_vecs(sourceValues_, VL_bits, true); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..96d23590a6 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -727,8 +727,8 @@ TEST_P(InstNeon, cmhs) { heap[1] = 0x7F; heap[2] = INT8_MAX; heap[3] = 1; - heap[4] = -128; - heap[5] = -1; + heap[4] = 128; + heap[5] = 1; heap[6] = 0xAA; heap[7] = 0xBB; heap[8] = 0xCC; @@ -744,7 +744,7 @@ TEST_P(InstNeon, cmhs) { heap[16] = INT8_MAX; heap[17] = 0x7F; heap[18] = 0; - heap[19] = -128; + heap[19] = 128; heap[20] = 1; heap[21] = 0; heap[22] = 0xAA; @@ -772,10 +772,10 @@ TEST_P(InstNeon, cmhs) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + {0x00, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, - {0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + {0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); } @@ -2684,8 +2684,8 @@ TEST_P(InstNeon, uminp) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0x00, 0xEE, 0x11, 0x22, 0x33, 0x44, 0x55, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08}); + {0x00, 0xAA, 0xBB, 0xDD, 0x01, 0x03, 0x05, 
0x07, 0x00, 0x11, 0x22, + 0x44, 0xEE, 0xCC, 0xAA, 0x88}); } TEST_P(InstNeon, umaxp) { // umaxp vd.16b vn.16b vm.16b @@ -2738,12 +2738,12 @@ TEST_P(InstNeon, umaxp) { ldr q0, [x0] ldr q1, [x0, #16] - umaxp v2.16b, v0.16b, v1.16b + umaxp v2.16b, v1.16b, v0.16b )"); CHECK_NEON(2, uint8_t, - {0x01, 0x00, 0xFF, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0xEE, 0xDD, - 0xCC, 0xBB, 0xAA, 0x99, 0x88}); + {0x00, 0xEE, 0x33, 0x55, 0xFF, 0xDD, 0xBB, 0x99, 0x01, 0xFF, 0xCC, + 0xEE, 0x02, 0x04, 0x06, 0x08}); } TEST_P(InstNeon, smax) { diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 55c7b945f3..a54c0c981a 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -8,110 +8,269 @@ namespace { using InstSme = AArch64RegressionTest; #if SIMENG_LLVM_VERSION >= 14 -TEST_P(InstSme, mova) { - // 8-bit +TEST_P(InstSme, addha) { + // 32-bit RUN_AARCH64(R"( smstart + zero {za} + ptrue p0.s - ptrue p1.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s - fdup z1.s, #1.0 - mov w0, #1 - index z2.s, #1, w0 - scvtf z2.s, p0/m, z2.s + dup z0.s, #65 + index z1.s, #0, #1 - fdup z4.s, #5.0 - fdup z5.s, #10.0 - fdup z6.s, #5.0 - fdup z7.s, #10.0 - fmopa za0.s, p0/m, p1/m, z2.s, z1.s + # Add to all rows and elems + addha za0.s, p0/m, p0/m, z1.s - ptrue p2.b - mov x2, #0 - mov x3, #2 - addvl x2, x2, #1 - sdiv x2, x2, x3 - whilelo p3.b, xzr, x2 + # Add to all rows, even numbered elements + addha za1.s, p0/m, p0/m, z0.s + addha za1.s, p0/m, p1/m, z1.s - mov w12, #0 - mov w15, #2 + # Add to even rows, all elements + addha za2.s, p0/m, p0/m, z0.s + addha za2.s, p1/m, p0/m, z1.s + + # Even numbered rows, even numbered elements + addha za3.s, p0/m, p0/m, z0.s + addha za3.s, p1/m, p1/m, z1.s + )"); + std::vector full32(64, 0); + std::vector index32(64, 0); + std::vector inter32(64, 0); + for (uint16_t i = 0; i < 64; i++) { + full32[i] = 65; + index32[i] = i; + inter32[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint32_t i = 0; i < (SVL / 32); i++) { + // All rows, all elems + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon(index32, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({i}, (SVL / 8))); + // All rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + if (i % 2 == 0) { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(index32, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + } else { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(full32, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(full32, (SVL / 8))); + } + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + dup z0.d, #65 + index z1.d, #0, #1 + + # Add to all rows and elems + addha za0.d, p0/m, p0/m, z1.d - mova z4.b, p2/m, za0h.b[w12, #0] - mova z5.b, p2/m, za0h.b[w12, #4] - mova z6.b, p3/m, za0h.b[w15, #6] - mova z7.b, p3/m, za0h.b[w15, #10] + # Add to all rows, even numbered elements + addha za1.d, p0/m, p0/m, z0.d + addha za1.d, p0/m, p1/m, z1.d + + # Add to even rows, all elements + addha za2.d, p0/m, p0/m, z0.d + addha za2.d, p1/m, p0/m, z1.d + + # Even numbered rows, even numbered elements + addha za3.d, p0/m, p0/m, z0.d + addha za3.d, p1/m, p1/m, z1.d )"); - CHECK_NEON(4, float, fillNeon({1}, SVL / 8)); - CHECK_NEON(5, float, fillNeon({2}, SVL / 8)); - CHECK_NEON(6, float, fillNeonCombined({3}, {5}, SVL / 8)); - CHECK_NEON(7, float, fillNeonCombined({4}, {10}, SVL / 8)); + std::vector full64(32, 0); + std::vector index64(32, 0); + std::vector inter64(32, 0); + for (uint16_t i = 0; i < 32; i++) { + full64[i] = 65; + index64[i] = i; + inter64[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint64_t i = 0; i < (SVL / 64); i++) { + // All rows, all elems + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon(index64, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({i}, (SVL / 8))); + // All rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD1, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + if (i % 2 == 0) { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(index64, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + } else { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(full64, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(full64, (SVL / 8))); + } + } } -TEST_P(InstSme, fmopa) { +TEST_P(InstSme, addva) { // 32-bit RUN_AARCH64(R"( smstart - fdup z1.s, #2.0 - fdup z2.s, #5.0 + zero {za} + ptrue p0.s - ptrue p1.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s - fmopa za0.s, p0/m, p1/m, z1.s, z2.s + dup z0.s, #65 + index z1.s, #0, #1 - fdup z3.s, #3.0 - fdup z4.s, #8.0 - mov x0, #0 - mov x1, #8 - addvl x0, x0, #1 - udiv x0, x0, x1 - whilelo p2.s, xzr, x0 + # Add to all cols and elems + addva za0.s, p0/m, p0/m, z1.s - fmopa za2.s, p0/m, p2/m, z3.s, z4.s + # All cols, even elements + addva za1.s, p0/m, p0/m, z0.s + addva za1.s, p1/m, p0/m, z1.s + + # Add to even numbered cols, all elements + addva za2.s, p0/m, p0/m, z0.s + addva za2.s, p0/m, p1/m, z1.s + + # Even numbered cols, even numbered elements + addva za3.s, p0/m, p0/m, z0.s + addva za3.s, p1/m, p1/m, z1.s )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { - CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, - fillNeon({10.0f}, (SVL / 8))); - CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, - fillNeon({24.0f}, (SVL / 16))); + std::vector full32(64, 0); + std::vector index32(64, 0); + std::vector inter32(64, 0); + for (uint16_t i = 0; i < 64; i++) { + full32[i] = 65; + index32[i] = i; + inter32[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint32_t i = 0; i < (SVL / 32); i++) { + // All cols, all elems + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon(index32, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({i}, (SVL / 8))); + // All cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + if (i % 2 == 0) { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(index32, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + } else { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(full32, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(full32, (SVL / 8))); + } } // 64-bit RUN_AARCH64(R"( smstart - fdup z1.d, #2.0 - fdup z2.d, #5.0 + zero {za} + ptrue p0.d - ptrue p1.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d - fmopa za0.d, p0/m, p1/m, z1.d, z2.d + dup z0.d, #65 + index z1.d, #0, #1 - fdup z3.d, #3.0 - fdup z4.d, #8.0 - mov x0, #0 - mov x1, #16 - addvl x0, x0, #1 - udiv x0, x0, x1 - whilelo p2.d, xzr, x0 + # Add to all cols and elems + addva za0.d, p0/m, p0/m, z1.d - fmopa za2.d, p0/m, p2/m, z3.d, z4.d + # All cols, even elements + addva za1.d, p0/m, p0/m, z0.d + addva za1.d, p1/m, p0/m, z1.d + + # Add to even numbered cols, all elements + addva za2.d, p0/m, p0/m, z0.d + addva za2.d, p0/m, p1/m, z1.d + + # Even numbered cols, even numbered elements + addva za3.d, p0/m, p0/m, z0.d + addva za3.d, p1/m, p1/m, z1.d )"); + std::vector full64(32, 0); + std::vector index64(32, 0); + std::vector inter64(32, 0); + for (uint16_t i = 0; i < 32; i++) { + full64[i] = 65; + index64[i] = i; + inter64[i] = (i % 2 == 0) ? 
i : 65; + } + for (uint64_t i = 0; i < (SVL / 64); i++) { - CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, - fillNeon({10.0}, (SVL / 8))); - CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, - fillNeon({24.0}, (SVL / 16))); + // All cols, all elems + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon(index64, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({i}, (SVL / 8))); + // All cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD1, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + if (i % 2 == 0) { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(index64, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + } else { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(full64, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(full64, (SVL / 8))); + } } } -TEST_P(InstSme, ld1d) { - // Horizontal +TEST_P(InstSme, mova_tileToVec) { + // 8-bit initialHeapData_.resize(SVL / 4); - uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64, src, SVL / 32); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src8 = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src8, SVL / 4); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -120,38 +279,77 @@ TEST_P(InstSme, ld1d) { smstart - mov x1, #1 - ptrue p0.d - mov w12, #0 - # Load and broadcast values from heap - ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za0h.d[w12, 1]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #16 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.d, xzr, x1 - ld1d {za1h.d[w12, 1]}, p1/z, [x0, x2, lsl #3] - )"); - CHECK_MAT_ROW( - AARCH64_REG_ZAD0, 0, uint64_t, - fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); - CHECK_MAT_ROW( - AARCH64_REG_ZAD0, 1, uint64_t, - fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); - CHECK_MAT_ROW(AARCH64_REG_ZAD1, 1, uint64_t, - fillNeonCombined( - {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b - // Vertical + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + dup z4.b, #5 + dup z5.b, #6 + dup z6.b, #7 + dup z7.b, #8 + + # Horizontal + ld1b {za0h.b[w12, #0]}, p0/z, [x0] + mova z0.b, p0/m, za0h.b[w12, #0] + mova z1.b, p1/m, za0h.b[w12, #0] + #Alias + mov z4.b, p0/m, za0h.b[w12, #0] + mov z5.b, p1/m, za0h.b[w12, #0] + + # Vertical + ld1b {za0v.b[w12, #3]}, p0/z, [x0] + mova z2.b, p0/m, za0v.b[w12, #3] + mova z3.b, p1/m, za0v.b[w12, #3] + #Alias + mov z6.b, p0/m, za0v.b[w12, #3] + mov z7.b, p1/m, za0v.b[w12, #3] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0xDE, 2, 0xBE, 2, 0x12, 2, 0x56, 2, 0x98, 2, + 0x54, 2, 0xAB, 2, 0xEF, 2}, + SVL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0xDE, 4, 0xBE, 4, 0x12, 4, 0x56, 4, 0x98, 4, + 0x54, 4, 0xAB, 4, 0xEF, 4}, + SVL / 8)); + CHECK_NEON(4, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 
0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(5, uint8_t, + fillNeon({0xDE, 6, 0xBE, 6, 0x12, 6, 0x56, 6, 0x98, 6, + 0x54, 6, 0xAB, 6, 0xEF, 6}, + SVL / 8)); + CHECK_NEON(6, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon({0xDE, 8, 0xBE, 8, 0x12, 8, 0x56, 8, 0x98, 8, + 0x54, 8, 0xAB, 8, 0xEF, 8}, + SVL / 8)); + + // 16-bit initialHeapData_.resize(SVL / 4); - uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64_vert, src_vert, SVL / 32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src16 = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src16, SVL / 8); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -160,40 +358,73 @@ TEST_P(InstSme, ld1d) { smstart - mov x1, #1 - ptrue p0.d - mov w12, #0 - # Load and broadcast values from heap - ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za0v.d[w12, 1]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #16 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.d, xzr, x1 - ld1d {za1v.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + dup z2.h, #3 + dup z3.h, #4 + dup z4.h, #5 + dup z5.h, #6 + dup z6.h, #7 + dup z7.h, #8 + + # Horizontal + ld1h {za0h.h[w12, #0]}, p0/z, [x0] + mova z0.h, p0/m, za0h.h[w12, #0] + mova z1.h, p1/m, za0h.h[w12, #0] + #Alias + mov z4.h, p0/m, za0h.h[w12, #0] + mov z5.h, p1/m, za0h.h[w12, #0] + + # Vertical + ld1h {za0v.h[w12, #3]}, p0/z, [x0] + mova z2.h, p0/m, za0v.h[w12, #3] + mova z3.h, p1/m, za0v.h[w12, #3] + #Alias + mov z6.h, p0/m, za0v.h[w12, #3] + mov z7.h, p1/m, za0v.h[w12, #3] )"); - CHECK_MAT_COL( - AARCH64_REG_ZAD0, 0, uint64_t, - fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); - CHECK_MAT_COL( - AARCH64_REG_ZAD0, 1, uint64_t, - fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); - CHECK_MAT_COL(AARCH64_REG_ZAD1, 1, uint64_t, - fillNeonCombined( - {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); -} + CHECK_NEON(0, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xDEAD, 2, 0x1234, 2, 0x9876, 2, 0xABCD, 2}, + SVL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 4, 0x1234, 4, 0x9876, 4, 0xABCD, 4}, + SVL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xDEAD, 6, 0x1234, 6, 0x9876, 6, 0xABCD, 6}, + SVL / 8)); + CHECK_NEON(6, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(7, uint16_t, + fillNeon({0xDEAD, 8, 0x1234, 8, 0x9876, 8, 0xABCD, 8}, + SVL / 8)); -TEST_P(InstSme, ld1w) { - // Horizontal + // 32-bit initialHeapData_.resize(SVL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; - fillHeap(heap32, src, SVL / 16); + std::vector src32 = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32, src32, 
SVL / 16); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -202,39 +433,64 @@ TEST_P(InstSme, ld1w) { smstart - mov x1, #1 + zero {za} + ptrue p0.s - mov w12, #1 - # Load and broadcast values from heap - ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za0h.s[w12, 2]}, p0/z, [x0] + pfalse p1.b + zip1 p1.s, p0.s, p1.s - # Test for inactive lanes - mov x1, #0 - mov x3, #8 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.s, xzr, x1 - ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + dup z4.s, #5 + dup z5.s, #6 + dup z6.s, #7 + dup z7.s, #8 + + # Horizontal + ld1w {za0h.s[w12, #0]}, p0/z, [x0] + mova z0.s, p0/m, za0h.s[w12, #0] + mova z1.s, p1/m, za0h.s[w12, #0] + #Alias + mov z4.s, p0/m, za0h.s[w12, #0] + mov z5.s, p1/m, za0h.s[w12, #0] + + # Vertical + ld1w {za0v.s[w12, #3]}, p0/z, [x0] + mova z2.s, p0/m, za0v.s[w12, #3] + mova z3.s, p1/m, za0v.s[w12, #3] + #Alias + mov z6.s, p0/m, za0v.s[w12, #3] + mov z7.s, p1/m, za0v.s[w12, #3] )"); - CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 1, uint64_t, - fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); - CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 3, uint64_t, - fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); - CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, - fillNeonCombined( - {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, SVL / 8)); + CHECK_NEON(0, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint32_t, + fillNeon({0xDEADBEEF, 2, 0x98765432, 2}, SVL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({0xDEADBEEF, 4, 0x98765432, 4}, SVL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint32_t, + fillNeon({0xDEADBEEF, 6, 0x98765432, 6}, SVL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({0xDEADBEEF, 8, 0x98765432, 8}, SVL / 8)); - // Vertical + // 64-bit initialHeapData_.resize(SVL / 4); - uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, - 0xABCDEF01}; - fillHeap(heap32_vert, src_vert, SVL / 16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src64 = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src64, SVL / 32); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -243,42 +499,60 @@ TEST_P(InstSme, ld1w) { smstart - mov x1, #1 - ptrue p0.s - mov w12, #1 - # Load and broadcast values from heap - ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za0v.s[w12, 2]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #8 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.s, xzr, x1 - ld1w {za1v.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + dup z4.d, #5 + dup z5.d, #6 + dup z6.d, #7 + dup z7.d, #8 + + # Horizontal + ld1d {za0h.d[w12, #0]}, p0/z, [x0] + mova z0.d, p0/m, za0h.d[w12, #0] + mova z1.d, p1/m, za0h.d[w12, #0] + #Alias + mov z4.d, p0/m, za0h.d[w12, #0] + mov z5.d, p1/m, za0h.d[w12, #0] + + # Vertical + ld1d {za0v.d[w12, #1]}, p0/z, [x0] + mova z2.d, p0/m, za0v.d[w12, #1] + mova 
z3.d, p1/m, za0v.d[w12, #1] + #Alias + mov z6.d, p0/m, za0v.d[w12, #1] + mov z7.d, p1/m, za0v.d[w12, #1] )"); - CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, - fillNeon( - {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); - CHECK_MAT_COL(AARCH64_REG_ZAS0, 3, uint32_t, - fillNeon( - {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); - CHECK_MAT_COL( - AARCH64_REG_ZAS1, 1, uint32_t, - fillNeonCombined( - {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); -} + CHECK_NEON( + 0, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint64_t, fillNeon({0xDEADBEEF12345678, 2}, SVL / 8)); + CHECK_NEON( + 2, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint64_t, fillNeon({0xDEADBEEF12345678, 4}, SVL / 8)); + CHECK_NEON( + 4, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint64_t, fillNeon({0xDEADBEEF12345678, 6}, SVL / 8)); + CHECK_NEON( + 6, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint64_t, fillNeon({0xDEADBEEF12345678, 8}, SVL / 8)); -TEST_P(InstSme, st1d) { - // Horizontal + // 128-bit + // Re-use 64-bit heap initialHeapData_.resize(SVL / 4); - uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64, src, SVL / 32); - + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src64, SVL / 32); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -287,235 +561,2852 @@ TEST_P(InstSme, st1d) { smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 + zero {za} + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d mov w12, #0 - ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za1h.d[w12, 1]}, p0/z, [x0, x1, lsl #3] - st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] - st1d {za1h.d[w12, 1]}, p0, [x4] + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + dup z4.d, #5 + dup z5.d, #6 + dup z6.d, #7 + dup z7.d, #8 + + # Horizontal + ld1d {za0h.d[w12, #0]}, p0/z, [x0] + mova z0.q, p0/m, za0h.q[w12, #0] + mova z1.q, p1/m, za0h.q[w12, #0] + #Alias + mov z4.q, p0/m, za0h.q[w12, #0] + mov z5.q, p1/m, za0h.q[w12, #0] + + # Vertical + mov w12, #1 + ld1d {z8.d}, p0/z, [x0] + mova za0v.q[w12, #0], p0/m, z8.q + mova z2.q, p0/m, za0v.q[w12, #0] + mova z3.q, p1/m, za0v.q[w12, #0] + #Alias + mov z6.q, p0/m, za0v.q[w12, #0] + mov z7.q, p1/m, za0v.q[w12, #0] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 8)), - src[i % 2]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src[i % 2]); - } + // Horizontal + CHECK_NEON( + 0, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 2, 2}, + SVL / 8)); + // Vertical + CHECK_NEON( + 2, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 4, 4}, + SVL / 8)); + // Horizontal + CHECK_NEON( + 4, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 6, 6}, + SVL / 8)); + // Vertical + CHECK_NEON( + 6, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint64_t, + 
fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 8, 8}, + SVL / 8)); +} +TEST_P(InstSme, mova_b_vecToTile) { + // 8-bit RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - smstart - mov x2, #0 - mov x4, #16 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #2 - whilelo p1.d, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b mov w12, #0 - mov w13, #1 - ld1d {za3h.d[w12, 0]}, p1/z, [x0, x3, lsl #3] - st1d {za3h.d[w12, 0]}, p1, [x5] - ld1d {za1h.d[w13, 1]}, p1/z, [x0, x3, lsl #3] - st1d {za1h.d[w13, 1]}, p1, [x5, x3, lsl #3] + dup z0.b, #1 + dup z1.b, #2 + + # Horizontal + mova za0h.b[w12, #0], p0/m, z0.b + mova za0h.b[w12, #1], p1/m, z1.b )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 8)), src[i % 2]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); } - // Vertical - initialHeapData_.resize(SVL / 4); - uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64_vert, src_vert, SVL / 32); + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Horizontal Alias + mov za0h.b[w12, #0], p0/m, z0.b + mov za0h.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Vertical + mova za0v.b[w12, #0], p0/m, z0.b + mova za0v.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 - ptrue p0.d + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b mov w12, #0 - ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za1v.d[w12, 1]}, p0/z, [x0, x1, lsl #3] - st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] - st1d {za1v.d[w12, 1]}, p0, [x4] + dup z0.b, #1 + dup z1.b, #2 + + # Vertical Alias + mov za0v.b[w12, #0], p0/m, z0.b + mov za0v.b[w12, #1], p1/m, z1.b )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 8)), - src_vert[i % 2]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src_vert[i % 2]); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); } +} +TEST_P(InstSme, mova_h_vecToTile) { + // 16-bit RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + 
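+ # Note: mova (vector to ZA slice) is a merging move under the governing predicate:
+ # active elements are copied from the Z register, inactive elements keep the tile's
+ # existing value (zeroes here, courtesy of `zero {za}`).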
smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Horizontal + mova za0h.h[w12, #0], p0/m, z0.h + mova za0h.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #16 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #2 - whilelo p1.d, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h mov w12, #0 - mov w13, #1 - ld1d {za3v.d[w12, 0]}, p1/z, [x0, x3, lsl #3] - st1d {za3v.d[w12, 0]}, p1, [x5] - ld1d {za1v.d[w13, 1]}, p1/z, [x0, x3, lsl #3] - st1d {za1v.d[w13, 1]}, p1, [x5, x3, lsl #3] + dup z0.h, #1 + dup z1.h, #2 + + # Horizontal Alias + mov za0h.h[w12, #0], p0/m, z0.h + mov za0h.h[w12, #1], p1/m, z1.h )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 8)), src_vert[i % 2]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Vertical + mova za0v.h[w12, #0], p0/m, z0.h + mova za0v.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Vertical Alias + mov za0v.h[w12, #0], p0/m, z0.h + mov za0v.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); } } -TEST_P(InstSme, st1w) { - // Horizontal - initialHeapData_.resize(SVL / 4); - uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; - fillHeap(heap32, src, SVL / 16); +TEST_P(InstSme, mova_s_vecToTile) { + // 32-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Horizontal + mova za0h.s[w12, #0], p0/m, z0.s + mova za0h.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + # Horizontal Alias + mov 
za0h.s[w12, #0], p0/m, z0.s + mov za0h.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 + zero {za} + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s mov w12, #0 - ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za1h.s[w12, 1]}, p0/z, [x0, x1, lsl #2] - st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] - st1w {za1h.s[w12, 1]}, p0, [x4] + dup z0.s, #1 + dup z1.s, #2 + + # Vertical + mova za0v.s[w12, #0], p0/m, z0.s + mova za0v.s[w12, #1], p1/m, z1.s )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 4)), - src[i % 4]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src[i % 4]); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Vertical Alias + mov za0v.s[w12, #0], p0/m, z0.s + mov za0v.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } +} +TEST_P(InstSme, mova_d_vecToTile) { + // 64-bit + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #8 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #4 - whilelo p1.s, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d mov w12, #0 - ld1w {za3h.s[w12, 0]}, p1/z, [x0, x3, lsl #2] - st1w {za3h.s[w12, 0]}, p1, [x5] - ld1w {za1h.s[w12, 2]}, p1/z, [x0, x3, lsl #2] - st1w {za1h.s[w12, 2]}, p1, [x5, x3, lsl #2] + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal + mova za0h.d[w12, #0], p0/m, z0.d + mova za0h.d[w12, #1], p1/m, z1.d )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 4)), src[i % 4]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); } - // Vertical - initialHeapData_.resize(SVL / 4); - uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, - 0xABCDEF01}; - fillHeap(heap32_vert, src_vert, SVL / 16); + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal Alias + mov za0h.d[w12, #0], p0/m, z0.d + mov za0h.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + 
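+ // Only rows 0 and 1 of the tile were written above, so every remaining row should
+ // still hold the zeroes established by `zero {za}`.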
CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical + mova za0v.d[w12, #0], p0/m, z0.d + mova za0v.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 - ptrue p0.s + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d mov w12, #0 - ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za1v.s[w12, 1]}, p0/z, [x0, x1, lsl #2] - st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] - st1w {za1v.s[w12, 1]}, p0, [x4] + dup z0.d, #1 + dup z1.d, #2 + + # Vertical Alias + mov za0v.d[w12, #0], p0/m, z0.d + mov za0v.d[w12, #1], p1/m, z1.d )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 4)), - src_vert[i % 4]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src_vert[i % 4]); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); } +} +TEST_P(InstSme, mova_q_vecToTile) { + // 128-bit RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal + mova za0h.q[w12, #0], p0/m, z0.q + mova za0h.q[w12, #0], p1/m, z1.q + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 0, uint64_t, + fillNeon({2, 2, 1, 1}, (SVL / 8))); + for (uint16_t i = 1; i < SVL / 128; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #8 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #4 - whilelo p1.s, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d mov w12, #0 - ld1w {za3v.s[w12, 0]}, p1/z, [x0, x3, lsl #2] - st1w {za3v.s[w12, 0]}, p1, [x5] - ld1w {za1v.s[w12, 2]}, p1/z, [x0, x3, lsl #2] - st1w {za1v.s[w12, 2]}, p1, [x5, x3, lsl #2] + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal Alias + mov za0h.q[w12, #0], p0/m, z0.q + mov za0h.q[w12, #0], p1/m, z1.q )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 4)), src_vert[i % 4]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 0, uint64_t, + fillNeon({2, 2, 1, 1}, (SVL / 8))); + for (uint16_t i = 1; i < SVL / 128; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical + mova za0v.q[w12, #0], p0/m, z0.q + mova za0v.q[w12, #0], p1/m, z1.q + 
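+ # A vertical .q slice places one 128-bit element in each tile row, so p1's
+ # alternating quadword pattern should leave even-numbered rows holding z1's
+ # quadword {2, 2} and odd-numbered rows keeping z0's {1, 1}; the rest of each
+ # row stays zero.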
)"); + auto onRow = fillNeon({0}, (SVL / 8)); + auto offRow = fillNeon({0}, (SVL / 8)); + onRow[0] = 2; + onRow[1] = 2; + offRow[0] = 1; + offRow[1] = 1; + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, onRow); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, offRow); + } + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical Alias + mov za0v.q[w12, #0], p0/m, z0.q + mov za0v.q[w12, #0], p1/m, z1.q + )"); + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, onRow); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, offRow); + } + } +} + +TEST_P(InstSme, fmopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + fdup z1.s, #2.0 + fdup z2.s, #5.0 + ptrue p0.s + ptrue p1.s + + zero {za} + + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + + fdup z3.s, #3.0 + fdup z4.s, #8.0 + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.s, xzr, x0 + + fmopa za2.s, p0/m, p2/m, z3.s, z4.s + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, + fillNeon({10.0f}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, + fillNeon({24.0f}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + fdup z1.d, #2.0 + fdup z2.d, #5.0 + ptrue p0.d + ptrue p1.d + + zero {za} + + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + + fdup z3.d, #3.0 + fdup z4.d, #8.0 + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.d, xzr, x0 + + fmopa za2.d, p0/m, p2/m, z3.d, z4.d + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, + fillNeon({10.0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, + fillNeon({24.0}, (SVL / 16))); + } +} + +TEST_P(InstSme, fmops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + fdup z1.s, #2.0 + fdup z2.s, #5.0 + ptrue p0.s + ptrue p1.s + + zero {za} + + fmops za0.s, p0/m, p1/m, z1.s, z2.s + + fdup z3.s, #3.0 + fdup z4.s, #8.0 + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.s, xzr, x0 + + fmops za2.s, p0/m, p2/m, z3.s, z4.s + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, + fillNeon({-10.0f}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, + fillNeon({-24.0f}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + fdup z1.d, #2.0 + fdup z2.d, #5.0 + ptrue p0.d + ptrue p1.d + + zero {za} + + fmops za0.d, p0/m, p1/m, z1.d, z2.d + + fdup z3.d, #3.0 + fdup z4.d, #8.0 + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.d, xzr, x0 + + fmops za2.d, p0/m, p2/m, z3.d, z4.d + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, + fillNeon({-10.0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, + fillNeon({-24.0}, (SVL / 16))); + } +} + +TEST_P(InstSme, ld1b) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from 
heap + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0h.b[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #2 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.b, xzr, x1 + mov w12, #15 + ld1b {za0h.b[w12, 0]}, p1/z, [x0, x2] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01, 0xDE}, + SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 3, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 15, uint8_t, + fillNeonCombined( + {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint8_t* heap8_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, + 0x56, 0x78, 0x98, 0x76, 0x54, 0x32, + 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8_vert, src_vert, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0v.b[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #2 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.b, xzr, x1 + mov w12, #15 + ld1b {za0v.b[w12, 0]}, p1/z, [x0, x2] + )"); + CHECK_MAT_COL( + AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01, 0xDE}, + SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAB0, 3, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 15, uint8_t, + fillNeonCombined( + {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1d) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.d + mov w12, #0 + # Load and broadcast values from heap + ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za0h.d[w12, 1]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #16 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.d, xzr, x1 + ld1d {za1h.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAD1, 1, uint64_t, + fillNeonCombined( + {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 
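+ # (syscall 214 is brk; brk(0) returns the current program break, which should
+ # point at the start of the pre-filled initialHeapData_ buffer)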
+ svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.d + mov w12, #0 + # Load and broadcast values from heap + ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za0v.d[w12, 1]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #16 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.d, xzr, x1 + ld1d {za1v.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + )"); + CHECK_MAT_COL( + AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAD1, 1, uint64_t, + fillNeonCombined( + {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1h) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.h + mov w12, #1 + # Load and broadcast values from heap + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za0h.h[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #4 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.h, xzr, x1 + ld1h {za1h.h[w12, 0]}, p1/z, [x0, x2, lsl #1] + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01, 0xDEAD}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 3, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, + 0x5432, 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAH1, 1, uint16_t, + fillNeonCombined({0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}, + {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint16_t* heap16_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16_vert, src_vert, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.h + mov w12, #1 + # Load and broadcast values from heap + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za0v.h[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #4 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.h, xzr, x1 + ld1h {za1v.h[w12, 0]}, p1/z, [x0, x2, lsl #1] + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01, 0xDEAD}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 3, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, + 0x5432, 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAH1, 1, uint16_t, + fillNeonCombined({0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}, + {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1q) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01, + 0x98765432ABCDEF01, 0xDEADBEEF12345678}; + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov 
x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + + # Test for inactive lanes - zip twice to get on-off for 128-bits + pfalse p1.b + zip1 p0.d, p0.d, p1.d + zip1 p0.d, p0.d, p0.d + ld1q {za15h.q[w12, 0]}, p0/z, [x0] + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 1 % (SVL / 128), uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678, + 0xDEADBEEF12345678, 0x98765432ABCDEF01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, 1 % (SVL / 128), uint64_t, + fillNeon( + {0xDEADBEEF12345678, 0x98765432ABCDEF01, 0, 0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1q {za0v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + + # Test for inactive lanes - zip twice to get on-off for 128-bits + pfalse p1.b + zip1 p0.d, p0.d, p1.d + zip1 p0.d, p0.d, p0.d + ld1q {za15v.q[w12, 0]}, p0/z, [x0] + )"); + // Can't check Q columns as CHECK_MAT_COL isn't set up for doing this with + // uint64_t. + // Instead, manually place values into 1st column of Q tile (as per + // asm above) and check each Q row. + auto row0 = fillNeon({0}, (SVL / 8)); + auto row1 = fillNeon({0}, (SVL / 8)); + auto zeroRow = fillNeon({0}, (SVL / 8)); + // MOD SVL / 64 as dealing with uint64_t even though its a 128-bit tile + row0[2 % (SVL / 64)] = 0x98765432ABCDEF01; + row0[3 % (SVL / 64)] = 0xDEADBEEF12345678; + row1[2 % (SVL / 64)] = 0xDEADBEEF12345678; + row1[3 % (SVL / 64)] = 0x98765432ABCDEF01; + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, row0); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, i, uint64_t, row1); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, row1); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, i, uint64_t, zeroRow); + } + } +} + +TEST_P(InstSme, ld1w) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.s + mov w12, #1 + # Load and broadcast values from heap + ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za0h.s[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #8 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.s, xzr, x1 + ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAS0, 1, uint64_t, + fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAS0, 3, uint64_t, + fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, + fillNeonCombined( + {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_vert, src_vert, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.s + mov w12, #1 + # Load and broadcast values from heap 
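+ # (vertical form: consecutive 32-bit heap elements go down a tile column, one
+ # element per row, rather than across a single row)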
+ ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za0v.s[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #8 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.s, xzr, x1 + ld1w {za1v.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon( + {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 3, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAS1, 1, uint32_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); +} + +TEST_P(InstSme, ldr) { + // Horizontal + initialHeapData_.resize(SVL); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + ptrue p0.b + mov w12, #0 + # Load and broadcast values from heap + ldr za[w12, 0], [x0] + ldr za[w12, 2], [x0, #2, mul vl] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, fillNeon({0}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 2, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + + for (uint16_t i = 3; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, SVL / 8)); + } +} + +TEST_P(InstSme, smopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + smopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + smopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + smopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + smopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } +} + +TEST_P(InstSme, smops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + smops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + smops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + smops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, 
#7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + smops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } +} + +TEST_P(InstSme, st1b) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0h.b[w12, 3]}, p0/z, [x0, x1] + st1b {za0h.b[w12, 0]}, p0, [sp, x1] + st1b {za0h.b[w12, 3]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src[i % 16]); + EXPECT_EQ(getMemoryValue((SVL / 8) + i), src[i % 16]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #16 + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0h.b[w12, 5]}, p0, [x5] + # Store odd indexed elements to memory + st1b {za0h.b[w12, 0]}, p1, [x5] + + # Load entire row + ld1b {za0h.b[w13, 1]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0h.b[w12, 5]}, p0, [x6, x3] + # Store odd indexed elements to memory + st1b {za0h.b[w13, 1]}, p1, [x6, x3] + )"); + for (uint64_t i = 0; i < (SVL / 8); i += 2) { + EXPECT_EQ(getMemoryValue(400 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(400 + (i + 1)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(800 + 16 + (i + 1)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint8_t* heap8_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, + 0x56, 0x78, 0x98, 0x76, 0x54, 0x32, + 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8_vert, src_vert, SVL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0v.b[w12, 1]}, p0/z, [x0, x1] + st1b {za0v.b[w12, 0]}, p0, [sp, x1] + st1b {za0v.b[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src_vert[i % 16]); + EXPECT_EQ(getMemoryValue((SVL / 8) + i), src_vert[i % 16]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #16 + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0v.b[w12, 5]}, p0, [x5] + # Store odd indexed elements to memory + st1b {za0v.b[w12, 0]}, p1, [x5] + + # Load entire row + ld1b {za0v.b[w13, 1]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0v.b[w12, 5]}, p0, [x6, x3] + # Store odd indexed elements to 
memory + st1b {za0v.b[w13, 1]}, p1, [x6, x3] + )"); + for (uint64_t i = 0; i < (SVL / 8); i += 2) { + EXPECT_EQ(getMemoryValue(400 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(400 + (i + 1)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(800 + 16 + (i + 1)), 0); + } +} + +TEST_P(InstSme, st1d) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.d + + mov w12, #0 + ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za1h.d[w12, 1]}, p0/z, [x0, x1, lsl #3] + st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] + st1d {za1h.d[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src[i % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1d {za3h.d[w12, 0]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0h.d[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1d {za3h.d[w12, 0]}, p1, [x5] + + # Load entire row + ld1d {za1h.d[w13, 1]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0h.d[w12, 0]}, p0, [x6, x3, lsl #3] + # Store odd indexed elements to memory + st1d {za1h.d[w13, 1]}, p1, [x6, x3, lsl #3] + )"); + for (uint64_t i = 0; i < (SVL / 64); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 8)), src[i % 2]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 8)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.d + + mov w12, #0 + ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za1v.d[w12, 1]}, p0/z, [x0, x1, lsl #3] + st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] + st1d {za1v.d[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src_vert[i % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src_vert[i % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1d {za3v.d[w12, 0]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0v.d[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1d {za3v.d[w12, 0]}, p1, [x5] + + # Load entire row + ld1d {za1v.d[w13, 1]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0v.d[w12, 0]}, p0, [x6, x3, lsl #3] + # Store odd indexed elements to memory + st1d 
{za1v.d[w13, 1]}, p1, [x6, x3, lsl #3] + )"); + for (uint64_t i = 0; i < (SVL / 64); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 8)), src_vert[i % 2]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 8)), 0); + } +} + +TEST_P(InstSme, st1h) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src, SVL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.h + + mov w12, #0 + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za1h.h[w12, 1]}, p0/z, [x0, x1, lsl #1] + st1h {za0h.h[w12, 0]}, p0, [sp, x1, lsl #1] + st1h {za1h.h[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 2)), + src[i % 8]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 2)), src[i % 8]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #8 + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1h.h[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1h {za0h.h[w12, 0]}, p1, [x5] + + # Load entire row + ld1h {za0h.h[w13, 1]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1h.h[w12, 0]}, p0, [x6, x3, lsl #1] + # Store odd indexed elements to memory + st1h {za0h.h[w13, 1]}, p1, [x6, x3, lsl #1] + )"); + for (uint64_t i = 0; i < (SVL / 16); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 2)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 2)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint16_t* heap16_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16_vert, src_vert, SVL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.h + + mov w12, #0 + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za1v.h[w12, 1]}, p0/z, [x0, x1, lsl #1] + st1h {za0v.h[w12, 0]}, p0, [sp, x1, lsl #1] + st1h {za1v.h[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 2)), + src_vert[i % 8]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 2)), src_vert[i % 8]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #8 + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1v.h[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1h {za0v.h[w12, 0]}, p1, [x5] + + # Load entire row + ld1h {za0v.h[w13, 1]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1v.h[w12, 
0]}, p0, [x6, x3, lsl #1] + # Store odd indexed elements to memory + st1h {za0v.h[w13, 1]}, p1, [x6, x3, lsl #1] + )"); + for (uint64_t i = 0; i < (SVL / 16); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 2)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 2)), 0); + } +} + +TEST_P(InstSme, st1q) { + // Horizontal + initialHeapData_.resize(SVL); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + mov w13, #1 + ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + ld1q {za1h.q[w13, 0]}, p0/z, [x0, x1, lsl #4] + st1q {za0h.q[w12, 0]}, p0, [sp, x1, lsl #4] + st1q {za1h.q[w13, 0]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + # Do zip1 twice to get on-off for 128-bit + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #2 + # Load entire row + ld1q {za3h.q[w12, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0h.q[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1q {za3h.q[w12, 0]}, p1, [x5] + + # Load entire row + ld1q {za1h.q[w13, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0h.q[w12, 0]}, p0, [x6, x3, lsl #4] + # Store odd indexed elements to memory + st1q {za1h.q[w13, 0]}, p1, [x6, x3, lsl #4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i += 2) { + EXPECT_EQ(getMemoryValue(400 + ((2 * i) * 8)), src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(400 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(400 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(400 + (((2 * i + 1) + 2) * 8)), 0); + + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i + 1) + 2) * 8)), 0); + } + + // Vertical + initialHeapData_.resize(SVL); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + mov w13, #1 + ld1q {za0v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + ld1q {za1v.q[w13, 0]}, p0/z, [x0, x1, lsl #4] + st1q {za0v.q[w12, 0]}, p0, [sp, x1, lsl #4] + st1q {za1v.q[w13, 0]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() 
- + 4095 + ((2 * i) * 8)), + src_vert[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i + 1) * 8)), + src_vert[(2 * i + 1) % 2]); + + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i) * 8)), + src_vert[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i + 1) * 8)), + src_vert[(2 * i + 1) % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + # Do zip1 twice to get on-off for 128-bit + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #2 + # Load entire row + ld1q {za3v.q[w12, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0v.q[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1q {za3v.q[w12, 0]}, p1, [x5] + + # Load entire row + ld1q {za1v.q[w13, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0v.q[w12, 0]}, p0, [x6, x3, lsl #4] + # Store odd indexed elements to memory + st1q {za1v.q[w13, 0]}, p1, [x6, x3, lsl #4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i += 2) { + EXPECT_EQ(getMemoryValue(400 + ((2 * i) * 8)), src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(400 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(400 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(400 + (((2 * i + 1) + 2) * 8)), 0); + + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i + 1) + 2) * 8)), 0); + } +} + +TEST_P(InstSme, st1w) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.s + + mov w12, #0 + ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za1h.s[w12, 1]}, p0/z, [x0, x1, lsl #2] + st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] + st1w {za1h.s[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src[i % 4]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #4 + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1w {za3h.s[w12, 0]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0h.s[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1w {za3h.s[w12, 0]}, p1, [x5] + + # Load entire row + ld1w {za1h.s[w13, 1]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0h.s[w12, 0]}, p0, [x6, x3, lsl #2] + # Store odd indexed elements to memory + st1w {za1h.s[w13, 1]}, p1, [x6, x3, lsl #2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 4)), src[i % 4]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 4)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 4)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + 
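+ // fillHeap appears to tile the 4-word source pattern across SVL/16 uint32_t
+ // entries (the whole SVL/4-byte buffer), so heap entry i holds src_vert[i % 4].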
uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_vert, src_vert, SVL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.s + + mov w12, #0 + ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za1v.s[w12, 1]}, p0/z, [x0, x1, lsl #2] + st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] + st1w {za1v.s[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src_vert[i % 4]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src_vert[i % 4]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #4 + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1w {za3v.s[w12, 0]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0v.s[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1w {za3v.s[w12, 0]}, p1, [x5] + + # Load entire row + ld1w {za1v.s[w13, 1]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0v.s[w12, 0]}, p0, [x6, x3, lsl #2] + # Store odd indexed elements to memory + st1w {za1v.s[w13, 1]}, p1, [x6, x3, lsl #2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 4)), src_vert[i % 4]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 4)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 4)), 0); + } +} + +TEST_P(InstSme, str) { + RUN_AARCH64(R"( + smstart + + zero {za} + + dup z0.b, #2 + dup z1.b, #5 + ptrue p0.b + ptrue p1.b + + # Fill first 32-bit ZA tile with 40 in every element + umopa za0.s, p0/m, p1/m, z0.b, z1.b + + dup z0.b, #1 + dup z1.b, #5 + + # Fill third 32-bit ZA tile with 20 in every element + umopa za2.s, p0/m, p1/m, z0.b, z1.b + + mov x2, #600 + mov w12, #0 + + # ZA sub tiles are interleaved, so 0th, 4th, 8th... rows will be for za0.s + # 2nd, 6th, 10th ... 
rows will be for za2.s + str za[w12, #0], [x2] + str za[w12, #1], [x2, #1, mul vl] + str za[w12, #2], [x2, #2, mul vl] + str za[w12, #3], [x2, #3, mul vl] + + # Store 8th row (3rd row of za0.s) + add w12, w12, #8 + mov x3, #0 + addvl x3, x3, #4 + add x2, x2, x3 + str za[w12, #0], [x2] + + # Store 10th row (3rd row of za2.s) + add w12, w12, #2 + mov x3, #0 + addvl x3, x3, #1 + add x2, x2, x3 + str za[w12, #0], [x2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({40}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon({0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({20}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + const uint64_t SVL_bytes = SVL / 8; + for (uint64_t i = 0; i < (SVL / 32); i++) { + const uint64_t off = i * sizeof(uint32_t); + EXPECT_EQ(getMemoryValue(600 + off), 40); + EXPECT_EQ(getMemoryValue(600 + SVL_bytes + off), 0); + EXPECT_EQ(getMemoryValue(600 + (2 * SVL_bytes) + off), 20); + EXPECT_EQ(getMemoryValue(600 + (3 * SVL_bytes) + off), 0); + EXPECT_EQ(getMemoryValue(600 + (4 * SVL_bytes) + off), 40); + EXPECT_EQ(getMemoryValue(600 + (5 * SVL_bytes) + off), 20); + } +} + +TEST_P(InstSme, sumopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #-8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #-7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.b, #3 + dup z2.b, #-1 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumopa za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.b, #7 + dup z4.b, #-2 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({3060}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #-8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #-7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 65535 + dup z1.h, #3 + dup z2.h, #-1 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumopa za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is signed, z4 is unsigned so will become 65534 + dup z3.h, #7 + dup z4.h, #-2 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({786420}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, 
i, int64_t, + fillNeon({1834952}, (SVL / 16))); + } +} + +TEST_P(InstSme, sumops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #-8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #-7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.b, #3 + dup z2.b, #-1 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumops za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.b, #7 + dup z4.b, #-2 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-3060}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #-8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #-7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 65535 + dup z1.h, #3 + dup z2.h, #-1 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumops za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is signed, z4 is unsigned so will become 65534 + dup z3.h, #7 + dup z4.h, #-2 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-786420}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-1834952}, (SVL / 16))); + } +} + +TEST_P(InstSme, umopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + umopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + umopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon({112}, (SVL / 16))); + } +} + +TEST_P(InstSme, umops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup
z1.b, #8 + dup z2.b, #3 + dup z3.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umops za0.s, p0/m, p1/m, z1.b, z3.b + + dup z3.b, #7 + dup z4.b, #4 + dup z5.b, #3 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + umops za2.s, p0/m, p2/m, z3.b, z5.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({32}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({28}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + dup z3.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + umopa za0.d, p0/m, p1/m, z1.h, z2.h + umops za0.d, p0/m, p1/m, z1.h, z3.h + + dup z3.h, #7 + dup z4.h, #4 + dup z5.h, #3 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + umopa za2.d, p0/m, p2/m, z3.h, z4.h + umops za2.d, p0/m, p2/m, z3.h, z5.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({32}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon({28}, (SVL / 16))); + } +} + +TEST_P(InstSme, usmopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #-3 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #-4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 253, z2 is signed + dup z1.b, #-3 + dup z2.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmopa za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is unsigned so will become 254, z4 is signed + dup z3.b, #-2 + dup z4.b, #7 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({2024}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #-3 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #-4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 65533, z2 is signed + dup z1.h, #-3 + dup z2.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmopa za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is unsigned so will become 65534, z4 is signed + dup z3.h, #-2 + dup z4.h, #7 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({524264}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({1834952}, (SVL /
16))); + } +} + +TEST_P(InstSme, usmops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #-3 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #-4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 253, z2 is signed + dup z1.b, #-3 + dup z2.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmops za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is unsigned so will become 254, z4 is signed + dup z3.b, #-2 + dup z4.b, #7 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-2024}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #-3 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #-4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 65533, z2 is signed + dup z1.h, #-3 + dup z2.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmops za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is unsigned so will become 65534, z4 is signed + dup z3.h, #-2 + dup z4.h, #7 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-524264}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-1834952}, (SVL / 16))); } } diff --git a/test/unit/aarch64/ArchitectureTest.cc b/test/unit/aarch64/ArchitectureTest.cc index dbc1fa65ac..8f2619a283 100644 --- a/test/unit/aarch64/ArchitectureTest.cc +++ b/test/unit/aarch64/ArchitectureTest.cc @@ -117,6 +117,7 @@ TEST_F(AArch64ArchitectureTest, predecode) { EXPECT_EQ(result, 4); EXPECT_EQ(output[0]->getInstructionAddress(), 0x4); EXPECT_EQ(output[0]->exceptionEncountered(), false); + EXPECT_EQ(output[0]->getGroup(), InstructionGroups::SVE_DIV_OR_SQRT); } TEST_F(AArch64ArchitectureTest, getSystemRegisterTag) { @@ -239,6 +240,23 @@ TEST_F(AArch64ArchitectureTest, get_set_SVCRVal) { EXPECT_EQ(arch->getSVCRval(), 3); } +TEST_F(AArch64ArchitectureTest, isSM_ZA_enabled) { + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); + arch->setSVCRval(1); + EXPECT_TRUE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); + arch->setSVCRval(2); + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_TRUE(arch->isZARegisterEnabled()); + arch->setSVCRval(3); + EXPECT_TRUE(arch->isStreamingModeEnabled()); + EXPECT_TRUE(arch->isZARegisterEnabled()); + arch->setSVCRval(0); + 
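// Illustrative sketch only (not from the patch): the SVCR bit layout that the
// isSM_ZA_enabled checks here pin down. Bit 0 of SVCR enables SVE streaming
// mode and bit 1 enables the ZA register; the constexpr helpers below are
// hypothetical and are not the Architecture class API.
#include <cstdint>

constexpr bool streamingModeBit(uint64_t svcr) { return svcr & 0b01; }
constexpr bool zaRegisterBit(uint64_t svcr) { return svcr & 0b10; }

static_assert(!streamingModeBit(0) && !zaRegisterBit(0), "SVCR = 0: neither");
static_assert(streamingModeBit(1) && !zaRegisterBit(1), "SVCR = 1: SM only");
static_assert(!streamingModeBit(2) && zaRegisterBit(2), "SVCR = 2: ZA only");
static_assert(streamingModeBit(3) && zaRegisterBit(3), "SVCR = 3: SM and ZA");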
EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); +} + } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc index 1ecf14a1a6..06ab76c1e3 100644 --- a/test/unit/aarch64/InstructionTest.cc +++ b/test/unit/aarch64/InstructionTest.cc @@ -58,6 +58,18 @@ class AArch64InstructionTest : public testing::Test { &rawInsn_cbz); cbzMetadata = std::make_unique(rawInsn_cbz); + // psel + cs_insn rawInsn_psel; + cs_detail rawDetail_psel; + rawInsn_psel.detail = &rawDetail_psel; + size_t size_psel = 4; + uint64_t address_psel = 0; + const uint8_t* encoding_psel = + reinterpret_cast(pselInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_psel, &size_psel, &address_psel, + &rawInsn_psel); + pselMetadata = std::make_unique(rawInsn_psel); + const uint8_t* badEncoding = reinterpret_cast(invalidInstrBytes.data()); invalidMetadata = std::make_unique(badEncoding); @@ -74,6 +86,8 @@ class AArch64InstructionTest : public testing::Test { std::array ldpInstrBytes = {0x61, 0x08, 0x40, 0xA9}; // cbz x2, #0x28 std::array cbzInstrBytes = {0x42, 0x01, 0x00, 0xB4}; + // psel p4, p0, p2.s[w13, 0] + std::array pselInstrBytes = {0x44, 0x40, 0x31, 0x25}; std::array invalidInstrBytes = {0x20, 0x00, 0x02, 0x8c}; // A Capstone decoding library handle, for decoding instructions. @@ -85,6 +99,7 @@ class AArch64InstructionTest : public testing::Test { std::unique_ptr fdivMetadata; std::unique_ptr ldpMetadata; std::unique_ptr cbzMetadata; + std::unique_ptr pselMetadata; std::unique_ptr invalidMetadata; std::unique_ptr uopInfo; InstructionException exception; @@ -182,7 +197,7 @@ TEST_F(AArch64InstructionTest, invalidInsn_1) { } EXPECT_EQ(insn.getException(), InstructionException::EncodingUnallocated); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); EXPECT_EQ(insn.getInstructionAddress(), 0x44); EXPECT_EQ(insn.getInstructionId(), 13); @@ -248,7 +263,7 @@ TEST_F(AArch64InstructionTest, invalidInsn_2) { } EXPECT_EQ(insn.getException(), InstructionException::HypervisorCall); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); EXPECT_EQ(insn.getInstructionAddress(), 0x43); EXPECT_EQ(insn.getInstructionId(), 15); diff --git a/test/unit/riscv/InstructionTest.cc b/test/unit/riscv/InstructionTest.cc index 6103cd4f5c..64eff7071c 100644 --- a/test/unit/riscv/InstructionTest.cc +++ b/test/unit/riscv/InstructionTest.cc @@ -178,7 +178,7 @@ TEST_F(RiscVInstructionTest, invalidInsn_1) { } EXPECT_EQ(insn.getException(), InstructionException::EncodingUnallocated); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH); EXPECT_EQ(insn.getInstructionAddress(), 0x44); EXPECT_EQ(insn.getInstructionId(), 13); @@ -242,7 +242,7 @@ TEST_F(RiscVInstructionTest, invalidInsn_2) { } EXPECT_EQ(insn.getException(), InstructionException::HypervisorCall); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH); EXPECT_EQ(insn.getInstructionAddress(), 0x43); 
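// Illustrative sketch only (not from the patch): the fallback behaviour that
// the "Default Group for instruction that is not decoded" comments above
// describe. The names below (kDefaultGroup, SketchInsn, sketchDecode) are
// hypothetical, not SimEng APIs; the point is that a group assigned up front
// survives a failed decode, so getGroup() stays well defined even when an
// exception is flagged.
#include <cstdint>

constexpr uint16_t kDefaultGroup = 1;  // stands in for an INT_SIMPLE_* group

struct SketchInsn {
  bool exceptionEncountered = false;
  uint16_t group = kDefaultGroup;  // set before decoding is attempted
};

inline SketchInsn sketchDecode(uint32_t encoding) {
  SketchInsn insn;
  if (encoding == 0x8c020020) {        // the unallocated encoding used above
    insn.exceptionEncountered = true;  // flag the failure ...
    return insn;                       // ... but keep the default group
  }
  insn.group = 2;  // a real decoder would derive the group from the encoding
  return insn;
}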
EXPECT_EQ(insn.getInstructionId(), 15);
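// Illustrative sketch only (not from the patch): the expected-value arithmetic
// behind the SME *mopa/*mops regression checks earlier in this diff.
// expectedOuterProductAcc() is a hypothetical helper, not part of the test
// suite. With byte sources feeding a 32-bit ZA tile (or halfword sources
// feeding a 64-bit tile), each destination element accumulates four widened
// products, and operands declared unsigned are reinterpreted first, so an
// int8_t of -1 contributes 255 while an int16_t of -1 contributes 65535.
#include <cstdint>

// subtract = false models *mopa (accumulate), true models *mops (subtract);
// ZA is zeroed beforehand in these tests, so the result is +/- the summed
// products.
constexpr int64_t expectedOuterProductAcc(int64_t zn, int64_t zm,
                                          bool subtract) {
  const int64_t sum = 4 * zn * zm;  // four widened products per element
  return subtract ? -sum : sum;
}

// Examples matching the checks above (operands already reinterpreted):
static_assert(expectedOuterProductAcc(8, 3, false) == 96);          // umopa
static_assert(expectedOuterProductAcc(3, 255, true) == -3060);      // sumops .s
static_assert(expectedOuterProductAcc(3, 65535, true) == -786420);  // sumops .d
static_assert(expectedOuterProductAcc(65533, 2, false) == 524264);  // usmopa .d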