diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..e5cf3dd3aa 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -558,6 +558,39 @@ RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for NEON instructions with the format `uaddlv rd, Vn.T`. + * T represents the type of the destination register (e.g. for h0, T = + * uint32_t). + * U represents the type of the sourceValues[0] (e.g. for v0.8b, U = + * uint8_t). + * I represents the number of source vector elements to be summed (e.g. for + * vn.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecAddlv(srcValContainer& sourceValues) { + const U* n = sourceValues[0].getAsVector(); + T out = 0; + for (int i = 0; i < I; i++) { + out += n[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `umaxv rd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * I represents the number of source vector elements to be compared (e.g. + * for vn.8b I = 8). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecUMaxV(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out = n[0]; + for (int i = 1; i < I; i++) { + out = std::max(n[i], out); + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `umaxp vd, vn, vm`. * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). * I represents the number of elements in the output array to be updated (e.g. 
diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe6..6d4c0df66a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -257,6 +257,32 @@ RegisterValue sveCpy_imm( return {out, 256}; } +/** Helper function for SVE instructions with the format `cpy zd, pg/m, rn + * T represents the type of sourceValues (e.g. for zd.d, T = int64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveCpy_Scalar( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* zd = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T rn = sourceValues[2].get(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = rn; + } else { + out[i] = zd[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `dec xdn{, * pattern{, MUL #imm}}`. * T represents the type of operation (e.g. for DECD, T = uint64_t). @@ -849,6 +875,132 @@ RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `ftsmul zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * U represents the same precision as T, but as an integer type for the second + * source register. + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue sveFTrigSMul(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const U* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + U bit_0_mask = static_cast(1) << (sizeof(T) * 8 - 1); + // Square each element in the first source vector and then set the sign bit + // to a copy of bit 0 of the corresponding element in the second source + // register. NOTE(review): bit_0_mask is 1 << (sizeof(T) * 8 - 1), i.e. the most-significant bit rather than bit 0 -- confirm against the Arm FTSMUL pseudocode + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] * n[i]; + T sign_bit = m[i] & bit_0_mask ? -1.0 : 1.0; + out[i] = std::abs(out[i]) * sign_bit; + } + + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `ftssel zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * U represents the same precision as T, but as an integer type for the second + * source register. + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFTrigSSel(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const U* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + U bit_0_mask = static_cast(1) << (sizeof(T) * 8 - 1); + U bit_1_mask = static_cast(1) << (sizeof(T) * 8 - 2); + + // Place the value 1.0 or a copy of the first source vector element in the + // destination element, depending on bit 0 of the corresponding element of + // the second source vector. The sign bit of the destination element is + // negated from bit 1 of the second source vector. NOTE(review): bit_0_mask and bit_1_mask are 1 << (sizeof(T) * 8 - 1) and 1 << (sizeof(T) * 8 - 2), i.e. the two most-significant bits rather than bits 0 and 1 -- confirm against the Arm FTSSEL pseudocode + for (int i = 0; i < partition_num; i++) { + out[i] = m[i] & bit_0_mask ? static_cast(1.0) : n[i]; + out[i] = m[i] & bit_1_mask ? -out[i] : out[i]; + } + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `ftmad zd, zn, zm, + * #imm`. + * T represents the type of sourceValues (e.g. 
for zn.d, T = double). + * Returns correctly formatted RegisterValue. **/ +template +RegisterValue sveFTrigMad( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + const uint8_t imm = static_cast(metadata.operands[3].imm); + + const std::array sin64 = {1.0, + -0.1666666666666661, + 0.8333333333320002e-02, + -0.1984126982840213e-03, + 0.2755731329901505e-05, + -0.2505070584637887e-07, + 0.1589413637195215e-09, + 0.0}; + + const std::array cos64 = {1.0, + -0.5000000000000000, + 0.4166666666666645e-01, + -0.1388888888886111e-02, + 0.2480158728388683e-04, + -0.2755731309913950e-06, + 0.2087558253975872e-08, + -0.1135338700720054e-10}; + + const std::array sin32 = {1.0f, + -1.666666716337e-01f, + 8.333330973983e-03f, + -1.983967522392e-04f, + 2.721174723774e-06f, + 0.0f, + 0.0f, + 0.0f}; + + const std::array cos32 = {1.0f, + -5.000000000000e-01f, + 4.166664928198e-02f, + -1.388759003021e-03f, + 2.446388680255e-05f, + 0.0f, + 0.0f, + 0.0f}; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + T coeff; + const bool sign_bit = std::signbit(m[i]); + // If float then use those LUTs + if (sizeof(T) == 4) { + coeff = sign_bit ? cos32[imm] : sin32[imm]; + } + // Else if double use those LUTs + else { + coeff = sign_bit ? cos64[imm] : sin64[imm]; + } + // TODO: Add FP16 support if/when we eventually support these (may require + // C++23) + out[i] = n[i] * std::abs(m[i]) + coeff; + } + + return {out, 256}; +} + /** Helper function for SVE instructions with the format `inc * xdn{, pattern{, MUL #imm}}`. * T represents the type of operation (e.g. for INCB, T = int8_t). @@ -936,6 +1088,63 @@ RegisterValue sveIndex( return {out, 256}; } +/** Helper function for SVE instructions with the format `lastb zd, pg, zn`. 
+ * T represents the vector register type (e.g. zd.d would be uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveLastBSimdScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + + // Get last active element + int lastElem = 0; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + // If no active lane has been found, select highest element instead + if (i == 0) lastElem = partition_num - 1; + } + return {n[lastElem], 256}; +} + +/** Helper function for SVE instructions with the format `clastb zd, pg, zd, + * zn`. + * T represents the vector register type (e.g. zd.d would be uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveCLastBSimdScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out; + + // Get last active element + int lastElem = -1; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + + if (lastElem < 0) { + out = m[0]; + } else { + out = n[lastElem]; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format ` * pd, pg/z, pn, pm`. * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). 
@@ -1268,7 +1477,8 @@ RegisterValue sveOrr_3vecs(srcValContainer& sourceValues, /** Helper function for SVE2 instructions with the format `psel pd, pn, * pm.t[wa, #imm]`. * T represents the type of sourceValues (e.g. for pm.d, T = - * uint64_t). Returns an array of 4 uint64_t elements. */ + * uint64_t). + * Returns an array of 4 uint64_t elements. */ template std::array svePsel( srcValContainer& sourceValues, @@ -1293,6 +1503,63 @@ std::array svePsel( return out; } +/** Helper function for SVE instructions with the format `pfirst pdn.b, pg, + * pdn.b`. + * Returns an array of 4 uint64_t elements, and updates the NZCV flags. + */ +std::tuple, uint8_t> svePfirst( + srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / 8; + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); + // Set destination d as source n to copy all false lanes and the active lanes + // beyond the first + std::array out = {dn[0], dn[1], dn[2], dn[3]}; + // Get the first active lane and set same lane in destination predicate + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64))); + if (p[i / 64] & shifted_active) { + out[i / 64] |= shifted_active; + break; + } + } + return {out, getNZCVfromPred(out, VL_bits, 1)}; +} + +/** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. + * T represents the type of sourceValues (e.g. for pdn.d, T = uint64_t). + * Returns an array of 4 uint64_t elements, and updates the NZCV flags. 
*/ +template +std::tuple, uint8_t> svePnext( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); + // Set destination elements to 0 + std::array out = {0, 0, 0, 0}; + + // Get last active element of dn.pattern + int lastElem = -1; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (dn[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + // Get next active element of p, starting from last of dn.pattern + for (int i = lastElem + 1; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i / (64 / sizeof(T))] |= shifted_active; + break; + } + } + return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; +} + /** Helper function for SVE instructions with the format `ptrue pd{, * pattern}. * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). @@ -1425,6 +1692,51 @@ RegisterValue sveSminv(srcValContainer& sourceValues, const uint16_t VL_bits) { return {out, 256}; } +/** Helper function for SVE instructions with the format `splice zd, pg, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue sveSplice(srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + // Get last active element + int lastElem = 0; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + + // Extract region from n as denoted by predicate p. Copy region into the + // lowest elements of the destination operand + bool active = false; + int index = 0; + for (int i = 0; i <= lastElem; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) active = true; + if (active) { + out[index] = n[i]; + index++; + } + } + + // Set any unassigned elements to the lowest elements in m + int elemsLeft = partition_num - index; + for (int i = 0; i < elemsLeft; i++) { + out[index] = m[i]; + index++; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `Sub zd, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). @@ -1634,33 +1946,31 @@ RegisterValue sveUzp_vecs(srcValContainer& sourceValues, const uint16_t VL_bits, return {out, 256}; } -/** Helper function for SVE instructions with the format `whilelo pd, - * n, m`. +/** Helper function for SVE instructions with the format `while pd, n, m`. * T represents the type of sourceValues n and m (e.g. for wn, T = uint32_t). * P represents the type of operand p (e.g. for pd.b, P = uint8_t). * Returns tuple of type [pred results (array of 4 uint64_t), nzcv]. 
*/ template -std::tuple, uint8_t> sveWhilelo( - srcValContainer& sourceValues, const uint16_t VL_bits, bool calcNZCV) { +std::tuple, uint8_t> sveWhile( + srcValContainer& sourceValues, const uint16_t VL_bits, + std::function func) { const T n = sourceValues[0].get(); const T m = sourceValues[1].get(); const uint16_t partition_num = VL_bits / (sizeof(P) * 8); std::array out = {0, 0, 0, 0}; - uint16_t index = 0; for (int i = 0; i < partition_num; i++) { // Determine whether lane should be active and shift to align with // element in predicate register. uint64_t shifted_active = - (n + i) < m ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; - out[index / (64 / (sizeof(P)))] = - out[index / (64 / (sizeof(P)))] | shifted_active; - index++; + func((n + i), m) ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; + out[i / (64 / (sizeof(P)))] |= shifted_active; } // Byte count = sizeof(P) as destination predicate is predicate of P // bytes. - uint8_t nzcv = calcNZCV ? getNZCVfromPred(out, VL_bits, sizeof(P)) : 0; + uint8_t nzcv = getNZCVfromPred(out, VL_bits, sizeof(P)); return {out, nzcv}; } diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index 2e6e68e37b..4c31eeb38a 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,6 +125,14 @@ class ReorderBuffer { */ uint64_t pc_; + /** A counter for how many cycles the same instruction has been at the head of + * the ROB */ + uint64_t robHeadRepeatCounter_ = 0; + + /** A limit for the counter of how long an instruction can be stuck at the + * head of the ROB before SimEng exits with an exception. */ + uint64_t robHeadRepeatLimit_ = 10000000; + /** The sequence ID of the youngest instruction that should remain after the * current flush. 
*/ uint64_t flushAfter_; diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1a..639f8e0655 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -429,9 +429,9 @@ bool ExceptionHandler::init() { << std::endl; return fatal(); } - uint64_t retval = (pid == 0) ? 1 : 0; - stateChange = {ChangeType::REPLACEMENT, {R0}, {retval}}; - stateChange.memoryAddresses.push_back({mask, 1}); + uint64_t retval = static_cast(bitmask); + stateChange = {ChangeType::REPLACEMENT, {R0}, {sizeof(retval)}}; + stateChange.memoryAddresses.push_back({mask, sizeof(bitmask)}); stateChange.memoryAddressValues.push_back(bitmask); } else { stateChange = {ChangeType::REPLACEMENT, {R0}, {-1ll}}; diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..219023d93a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -100,6 +100,14 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; break; + case Opcode::AArch64_FTSMUL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_S: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_FSUB_ZPmI_D: [[fallthrough]]; case Opcode::AArch64_FSUB_ZPmI_H: @@ -123,6 +131,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[2].access = CS_AC_READ; break; } + case Opcode::AArch64_FTMAD_ZZI_D: + [[fallthrough]]; + case Opcode::AArch64_FTMAD_ZZI_S: { + // Incorrect access types + operands[0].access = CS_AC_READ | CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + break; + } + case Opcode::AArch64_PFIRST_B: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_D: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_S: + 
[[fallthrough]]; + case Opcode::AArch64_PNEXT_H: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_B: { + // Incorrect access types + operands[0].access = CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + // Doesn't identify implicit NZCV destination + implicitDestinationCount = 1; + implicitDestinations[0] = AARCH64_REG_NZCV; + break; + } + case Opcode::AArch64_CLASTB_VPZ_D: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_S: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_H: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_B: + [[fallthrough]]; case Opcode::AArch64_AND_ZPmZ_D: // Example bytecode - 4901da04 [[fallthrough]]; case Opcode::AArch64_AND_ZPmZ_H: @@ -155,6 +198,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) [[fallthrough]]; case Opcode::AArch64_SMAX_ZPmZ_S: // Example bytecode - 01008804 [[fallthrough]]; + case Opcode::AArch64_SPLICE_ZPZ_D: + [[fallthrough]]; + case Opcode::AArch64_SPLICE_ZPZ_S: + [[fallthrough]]; case Opcode::AArch64_MUL_ZPmZ_B: // Example bytecode - 40001004 [[fallthrough]]; case Opcode::AArch64_MUL_ZPmZ_D: diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..06eb7e2004 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -451,6 +451,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } + case Opcode::AArch64_LDAXRB: { // ldaxrb wt, [xn] + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; @@ -749,6 +753,13 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } + case Opcode::AArch64_LDRSWroW: { // ldrsw xt, [xn, wm{, extend + // {#amount}}] + uint64_t offset = extendOffset(sourceValues_[1].get(), + 
metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); + break; + } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] uint64_t base = sourceValues_[0].get() + metadata_.operands[1].mem.disp; @@ -1350,11 +1361,19 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } - case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] + case Opcode::AArch64_STLXRB: { // stlxrb ws, wt, [xn] + setMemoryAddresses({{sourceValues_[1].get(), 1}}); + break; + } + case Opcode::AArch64_STLXRH: { // stlxrh ws, wt, [xn] + setMemoryAddresses({{sourceValues_[1].get(), 2}}); + break; + } + case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } - case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] + case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..215ade08fa 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -500,7 +500,9 @@ void Instruction::decode() { // Check first operand access to determine if it's a load or store if (metadata_.operands[0].access & CS_AC_WRITE) { if (metadata_.id == AARCH64_INS_STXR || - metadata_.id == AARCH64_INS_STLXR) { + metadata_.id == AARCH64_INS_STLXR || + metadata_.id == AARCH64_INS_STLXRB || + metadata_.id == AARCH64_INS_STLXRH) { // Exceptions to this is load condition are exclusive store with a // success flag as first operand if (microOpcode_ != MicroOpcode::STR_DATA) { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..3090e3cb42 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -205,6 +205,10 @@ void Instruction::execute() { results_[0] = 
vecSumElems_2ops(sourceValues_); break; } + case Opcode::AArch64_UADDLVv8i8v: { // uaddlv hd, vn.8b + results_[0] = vecAddlv(sourceValues_); + break; + } case Opcode::AArch64_ADDWri: { // add wd, wn, #imm{, shift} auto [result, nzcv] = addShift_imm(sourceValues_, metadata_, false); @@ -355,6 +359,30 @@ void Instruction::execute() { sveAdr_packedOffsets(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_FTSMUL_ZZZ_S: { // ftsmul zd.s, zn.s, zm.s + results_[0] = sveFTrigSMul(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSMUL_ZZZ_D: { // ftsmul zd.d, zn.d, zm.d + results_[0] = sveFTrigSMul(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSSEL_ZZZ_S: { // ftssel zd.s, zn.s, zm.s + results_[0] = sveFTrigSSel(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSSEL_ZZZ_D: { // ftssel zd.d, zn.d, zm.d + results_[0] = sveFTrigSSel(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTMAD_ZZI_S: { // ftmad zd.s, zn.s, zm.s, #imm + results_[0] = sveFTrigMad(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_FTMAD_ZZI_D: { // ftmad zd.d, zn.d, zm.d, #imm + results_[0] = sveFTrigMad(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_ANDSWri: { // ands wd, wn, #imm auto [result, nzcv] = logicOp_imm( sourceValues_, metadata_, true, @@ -675,6 +703,12 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> bool { return (x == y); }); break; } + case Opcode::AArch64_CMEQv2i32rz: { // cmeq vd.2s, vn.2s, #0 + results_[0] = vecCompare( + sourceValues_, true, + [](uint32_t x, uint32_t y) -> bool { return (x == y); }); + break; + } case Opcode::AArch64_CMEQv4i32: { // cmeq vd.4s, vn.4s, vm.4s results_[0] = vecCompare( sourceValues_, false, @@ -693,6 +727,12 @@ void Instruction::execute() { [](int8_t x, int8_t y) -> bool { return (x == y); }); break; } + case Opcode::AArch64_CMHIv2i32: { // cmhi vd.2s, vn.2s, vm.2s + results_[0] = vecCompare( + sourceValues_, false, + 
[](uint32_t x, uint32_t y) -> bool { return (x > y); }); + break; + } case Opcode::AArch64_CMHIv4i32: { // cmhi vd.4s, vn.4s, vm.4s results_[0] = vecCompare( sourceValues_, false, @@ -833,6 +873,38 @@ void Instruction::execute() { results_[1] = output; break; } + case Opcode::AArch64_CMPHS_PPzZZ_B: { // cmphs pd.b, pg/z, zn.b, zm.b + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, + [](uint8_t x, uint8_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_D: { // cmphs pd.d, pg/z, zn.d, zm.d + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, + [](uint64_t x, uint64_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_H: { // cmphs pd.h, pg/z, zn.h, zm.h + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, + [](uint16_t x, uint16_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_S: { // cmphs pd.s, pg/z, zn.s, zm.s + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, false, + [](uint32_t x, uint32_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } case Opcode::AArch64_CMPNE_PPzZI_B: { // cmpne pd.b, pg/z. 
zn.b, #imm auto [output, nzcv] = sveCmpPredicated_toPred( sourceValues_, metadata_, VL_bits, true, @@ -949,6 +1021,22 @@ void Instruction::execute() { results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_CPY_ZPmV_B: { // cpy zd.b, pg/m, vn.b + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_D: { // cpy zd.d, pg/m, vn.d + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_H: { // cpy zd.h, pg/m, vn.h + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_S: { // cpy zd.s, pg/m, vn.s + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_DUPi32: { // dup vd, vn.s[index] results_[0] = vecDup_gprOrIndex(sourceValues_, metadata_, false); @@ -1752,6 +1840,10 @@ void Instruction::execute() { results_[0] = vecFDiv(sourceValues_); break; } + case Opcode::AArch64_FDIVv4f32: { // fdiv vd.4s, vn.4s, vm.4s + results_[0] = vecFDiv(sourceValues_); + break; + } case Opcode::AArch64_FDUP_ZI_D: { // fdup zd.d, #imm results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, VL_bits, true); @@ -2557,6 +2649,38 @@ void Instruction::execute() { vecInsIndex_gpr(sourceValues_, metadata_); break; } + case Opcode::AArch64_LASTB_VPZ_D: { // lastb dd, pg, zn.d + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_S: { // lastb sd, pg, zn.s + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_H: { // lastb hd, pg, zn.h + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_B: { // lastb bd, pg, zn.b + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d + results_[0] = sveCLastBSimdScalar(sourceValues_, 
VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_S: { // clastb sd, pg, sn, zn.s + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_H: { // clastb hd, pg, hn, zn.h + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_B: { // clastb bd, pg, bn, zn.b + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD @@ -3330,6 +3454,11 @@ void Instruction::execute() { results_[0] = memoryData_[0]; break; } + case Opcode::AArch64_LDAXRB: { // ldaxrb wt, [xn] + // LOAD + results_[0] = memoryData_[0].zeroExtend(1, 8); + break; + } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] // LOAD results_[0] = memoryData_[0].zeroExtend(4, 8); @@ -3603,6 +3732,12 @@ void Instruction::execute() { results_[0] = static_cast(memoryData_[0].get()); break; } + case Opcode::AArch64_LDRSWroW: { // ldrsw xt, [xn, wm{, extend + // {#amount}}] + // LOAD + results_[0] = static_cast(memoryData_[0].get()); + break; + } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] // LOAD results_[0] = static_cast(memoryData_[0].get()); @@ -4007,11 +4142,51 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_ORNv8i8: { // orn vd.8b, vn.8b, vm.8b + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x | (~y); }); + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; break; } + case Opcode::AArch64_PFIRST_B: { // pfirst pdn.b, pg, pdn.b + auto [result, nzcv] = svePfirst(sourceValues_, VL_bits); + results_[0] = nzcv; + results_[1] = result; + break; + } + case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + 
results_[1] = result; + break; + } + case Opcode::AArch64_PNEXT_H: { // pnext pdn.h, pv, pdn.h + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; + break; + } + case Opcode::AArch64_PNEXT_S: { // pnext pdn.s, pv, pdn.s + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; + break; + } + case Opcode::AArch64_PNEXT_D: { // pnext pdn.d, pv, pdn.d + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; + break; + } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] break; } @@ -4300,34 +4475,88 @@ void Instruction::execute() { results_[0] = maddl_4ops(sourceValues_); break; } + case Opcode::AArch64_SMAX_ZI_D: { // smax zdn.d, zdn.d, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_SMAX_ZI_S: { // smax zdn.s, zdn.s, #imm results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_SMAX_ZI_H: { // smax zdn.h, zdn.h, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZI_B: { // smax zdn.b, zdn.b, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZPmZ_D: { // smax zd.d, pg/m, zn.d, zm.d + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMAX_ZPmZ_S: { // smax zd.s, pg/m, zn.s, zm.s results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_SMAX_ZPmZ_H: { // smax zd.h, pg/m, zn.h, zm.h + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZPmZ_B: { // smax zd.b, pg/m, zn.b, zm.b + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMAXv4i32: { // smax vd.4s, vn.4s, vm.4s results_[0] = 
vecLogicOp_3vecs( sourceValues_, [](int32_t x, int32_t y) -> int32_t { return std::max(x, y); }); break; } + case Opcode::AArch64_SMINV_VPZ_D: { // sminv dd, pg, zn.d + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMINV_VPZ_S: { // sminv sd, pg, zn.s results_[0] = sveSminv(sourceValues_, VL_bits); break; } + case Opcode::AArch64_SMINV_VPZ_H: { // sminv hd, pg, zn.h + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SMINV_VPZ_B: { // sminv bd, pg, zn.b + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMINVv4i32v: { // sminv sd, vn.4s results_[0] = vecMinv_2ops(sourceValues_); break; } + case Opcode::AArch64_SMIN_ZPmZ_D: { // smin zd.d, pg/m, zn.d, zm.d + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> int64_t { return std::min(x, y); }); + break; + } case Opcode::AArch64_SMIN_ZPmZ_S: { // smin zd.s, pg/m, zn.s, zm.s results_[0] = sveLogicOpPredicated_3vecs( sourceValues_, VL_bits, [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } + case Opcode::AArch64_SMIN_ZPmZ_H: { // smin zd.h, pg/m, zn.h, zm.h + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int16_t x, int16_t y) -> int16_t { return std::min(x, y); }); + break; + } + case Opcode::AArch64_SMIN_ZPmZ_B: { // smin zd.b, pg/m, zn.b, zm.b + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int8_t x, int8_t y) -> int8_t { return std::min(x, y); }); + break; + } case Opcode::AArch64_SMINv4i32: { // smin vd.4s, vn.4s, vm.4s results_[0] = vecLogicOp_3vecs( sourceValues_, @@ -4359,6 +4588,14 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_SPLICE_ZPZ_D: { // splice zdn.d, pv, zdn.d, zm.d + results_[0] = sveSplice(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SPLICE_ZPZ_S: { // splice zdn.s, pv, zdn.s, zm.s + results_[0] = 
sveSplice(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SSHLLv2i32_shift: { // sshll vd.2d, vn.2s, #imm results_[0] = vecShllShift_vecImm( sourceValues_, metadata_, false); @@ -4945,12 +5182,14 @@ void Instruction::execute() { memoryData_[0] = sourceValues_[0]; break; } + case Opcode::AArch64_STLXRB: // stlxrb ws, wt, [xn] + case Opcode::AArch64_STLXRH: // stlxrh ws, wt, [xn] case Opcode::AArch64_STLXRW: // stlxr ws, wt, [xn] case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] // STORE memoryData_[0] = sourceValues_[0]; - // TODO: Implement atomic memory access - results_[0] = static_cast(0); + // TODO: Implement atomic memory access + results_[0] = {0, 8}; break; } case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] @@ -5568,6 +5807,26 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMAXVv16i8v: { // umaxv bd, vn.16b + results_[0] = vecUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv4i16v: { // umaxv hd, vn.4h + results_[0] = vecUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv4i32v: { // umaxv sd, vn.4s + results_[0] = vecUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv8i16v: { // umaxv hd, vn.8h + results_[0] = vecUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv8i8v: { // umaxv bd, vn.8b + results_[0] = vecUMaxV(sourceValues_); + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); @@ -5726,85 +5985,129 @@ void Instruction::execute() { break; } case Opcode::AArch64_WHILELO_PWW_B: { // whilelo pd.b, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_D: { // whilelo pd.d, wn, wm - auto 
[output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_H: { // whilelo pd.h, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_S: { // whilelo pd.s, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_B: { // whilelo pd.b, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_D: { // whilelo pd.d, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_H: { // whilelo pd.h, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_S: { // whilelo pd.s, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); + results_[0] = nzcv; + 
results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_B: { // whilels pd.b, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_D: { // whilels pd.d, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_H: { // whilels pd.h, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_S: { // whilels pd.s, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_B: { // whilelt pd.b, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_D: { // whilelt pd.d, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_H: { // whilelt pd.h, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_S: { // whilelt pd.s, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, 
VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index e72e6e79dc..20a2970995 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -81,9 +81,31 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { unsigned int n; for (n = 0; n < maxCommits; n++) { auto& uop = buffer_[0]; + if (!uop->canCommit()) { + // If an instruction has been stuck at the head of the rob for + // sufficiently long, assume an error in SimEng has occurred. + robHeadRepeatCounter_++; + if (robHeadRepeatCounter_ > robHeadRepeatLimit_) { + std::cerr << "[SimEng:ReorderBuffer] Instruction stuck unable to " + "commit at the head of ROB for 10,000,000 cycles at " + "instruction address 0x" + << std::hex << uop->getInstructionAddress() << std::dec + << " (MicroOp Index: " << uop->getMicroOpIndex() + << "). This is unexpected behaviour for most valid core " + "configurations, though may arise in designs with very " + "high latencies or bottlenecks. If this is not the case, " + "please try re-running. If this may be expected, you can " + "increase this limit in " + "`SimEng/src/include/pipeline/ReorderBuffer.hh` under the " + "variable `robHeadRepeatLimit_`. Please raise " + "an issue on GitHub if the problem persists."
+ << std::endl; + exit(1); + } break; } + robHeadRepeatCounter_ = 0; if (uop->isLastMicroOp()) instructionsCommitted_++; diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09d..3b2490666d 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -503,7 +503,7 @@ class AArch64RegressionTest : public RegressionTest { std::array generatedArray; generatedArray.fill(0); // Fill array by cycling through source elements - for (int i = 0; i < (num_bytes / sizeof(T)); i++) { + for (size_t i = 0; i < (num_bytes / sizeof(T)); i++) { generatedArray[i] = src[i % src.size()]; } return generatedArray; diff --git a/test/regression/aarch64/Syscall.cc b/test/regression/aarch64/Syscall.cc index 0866c278e2..c7c19eb9a2 100644 --- a/test/regression/aarch64/Syscall.cc +++ b/test/regression/aarch64/Syscall.cc @@ -1080,7 +1080,7 @@ TEST_P(Syscall, sched_getaffinity) { )"); EXPECT_EQ(getGeneralRegister(21), -1); EXPECT_EQ(getGeneralRegister(22), -1); - EXPECT_EQ(getGeneralRegister(23), 1); + EXPECT_EQ(getGeneralRegister(23), 8); } // TODO: write tgkill test diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64dc..8622169db0 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -71,11 +71,17 @@ TEST_P(InstBitmanip, extr) { extr w4, w1, w2, 4 extr w5, w1, w2, 24 extr w6, w1, w2, 31 + + # Check alias + ror w7, w1, 31 + ror w8, w1, 24 )"); EXPECT_EQ(getGeneralRegister(3), 0x12345678); EXPECT_EQ(getGeneralRegister(4), 0xF1234567); EXPECT_EQ(getGeneralRegister(5), 0xADBEEF12); EXPECT_EQ(getGeneralRegister(6), 0xBD5B7DDE); + EXPECT_EQ(getGeneralRegister(7), 0xBD5B7DDF); + EXPECT_EQ(getGeneralRegister(8), 0xADBEEFDE); // 64-bit initialHeapData_.resize(16); diff --git a/test/regression/aarch64/instructions/load.cc 
b/test/regression/aarch64/instructions/load.cc index 09269eebb8..bf5a3cad47 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -3,6 +3,7 @@ namespace { using InstLoad = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstLoad, ld1r) { // 8-bit @@ -695,6 +696,45 @@ TEST_P(InstLoad, ldarb) { EXPECT_EQ(getGeneralRegister(7), 64); } +TEST_P(InstLoad, ldaxrb) { + initialHeapData_.resize(8); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + ldaxrb w1, [x0] + add x0, x0, #1 + ldaxrb w2, [x0] + add x0, x0, #1 + ldaxrb w3, [x0] + add x0, x0, #1 + ldaxrb w4, [x0] + add x0, x0, #1 + ldaxrb w5, [x0] + add x0, x0, #1 + ldaxrb w6, [x0] + add x0, x0, #1 + ldaxrb w7, [x0] + add x0, x0, #1 + ldaxrb w8, [x0] + )"); + EXPECT_EQ(getGeneralRegister(1), 0xEF); + EXPECT_EQ(getGeneralRegister(2), 0xBE); + EXPECT_EQ(getGeneralRegister(3), 0xAD); + EXPECT_EQ(getGeneralRegister(4), 0xDE); + EXPECT_EQ(getGeneralRegister(5), 0x78); + EXPECT_EQ(getGeneralRegister(6), 0x56); + EXPECT_EQ(getGeneralRegister(7), 0x34); + EXPECT_EQ(getGeneralRegister(8), 0x12); + + EXPECT_GROUP(R"(ldaxrb w8, [x0])", LOAD_INT); +} + TEST_P(InstLoad, ldrb) { initialHeapData_.resize(8); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); @@ -1277,17 +1317,21 @@ TEST_P(InstLoad, ldrsw) { mov x0, 0 mov x8, 214 svc #0 - mov x5, 1 + mov x6, 1 # Load 32-bit values from heap and sign-extend to 64-bits ldrsw x1, [x0, #4] ldrsw x2, [x0], #4 ldrsw x3, [x0] - ldrsw x4, [x0, x5, lsl #2] + ldrsw x4, [x0, x6, lsl #2] + ldrsw x5, [x0, w6, uxtw #2] )"); EXPECT_EQ(getGeneralRegister(1), INT32_MAX); EXPECT_EQ(getGeneralRegister(2), -2); EXPECT_EQ(getGeneralRegister(3), INT32_MAX); EXPECT_EQ(getGeneralRegister(4), -5); + EXPECT_EQ(getGeneralRegister(5), -5); + + EXPECT_GROUP(R"(ldrsw x4, [x0, 
x6, lsl #2])", LOAD_INT); // ldursw RUN_AARCH64(R"( diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..c66f6f3c6f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -356,6 +356,28 @@ TEST_P(InstNeon, addv) { CHECK_NEON(1, uint8_t, {40}); } +TEST_P(InstNeon, uaddlv) { + // 16-bit + initialHeapData_.resize(16); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 16; i++) { + heap8[i] = (i + 1); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + uaddlv h1, v0.8b + )"); + CHECK_NEON(1, uint16_t, {36}); + + EXPECT_GROUP(R"(uaddlv h1, v0.8b)", SCALAR_SIMPLE_ARTH_NOSHIFT); +} + TEST_P(InstNeon, and) { initialHeapData_.resize(32); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); @@ -691,18 +713,53 @@ TEST_P(InstNeon, cmeq) { CHECK_NEON(2, uint8_t, {0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, {0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00}); - // 32-bit + // 32-bit, 2 lane initialHeapData_.resize(128); - uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - heap32[0] = 10; - heap32[1] = 11; - heap32[2] = 12; - heap32[3] = 13; + uint32_t* heapv2i32 = reinterpret_cast(initialHeapData_.data()); + heapv2i32[0] = 10; + heapv2i32[1] = 0; + + heapv2i32[2] = 0; + heapv2i32[3] = 12; + + heapv2i32[4] = 15; + heapv2i32[5] = 9; + + heapv2i32[6] = 0; + heapv2i32[7] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + ldr q2, [x0, #16] + ldr q3, [x0, #24] + cmeq v4.2s, v0.2s, #0 + cmeq v5.2s, v1.2s, #0 + cmeq v6.2s, v2.2s, #0 + cmeq v7.2s, v3.2s, #0 + )"); + CHECK_NEON(4, uint32_t, {0, 0xFFFFFFFFu}); + CHECK_NEON(5, uint32_t, {0xFFFFFFFFu, 0}); + CHECK_NEON(6, uint32_t, {0, 0}); + CHECK_NEON(7, uint32_t, {0xFFFFFFFFu, 0xFFFFFFFFu}); - heap32[4] = 13; - heap32[5] = 
11; - heap32[6] = 12; - heap32[7] = 10; + // 32-bit, 4 lane + initialHeapData_.resize(128); + uint32_t* heapv4i32 = reinterpret_cast(initialHeapData_.data()); + heapv4i32[0] = 10; + heapv4i32[1] = 11; + heapv4i32[2] = 12; + heapv4i32[3] = 13; + + heapv4i32[4] = 13; + heapv4i32[5] = 11; + heapv4i32[6] = 12; + heapv4i32[7] = 10; RUN_AARCH64(R"( # Get heap address @@ -715,6 +772,8 @@ TEST_P(InstNeon, cmeq) { cmeq v2.4s, v0.4s, v1.4s )"); CHECK_NEON(2, uint32_t, {0, 0xFFFFFFFFu, 0xFFFFFFFFu, 0}); + + EXPECT_GROUP(R"(cmeq v2.4s, v0.4s, v1.4s)", VECTOR_SIMPLE_CMP); } TEST_P(InstNeon, cmhs) { @@ -780,16 +839,40 @@ TEST_P(InstNeon, cmhs) { } TEST_P(InstNeon, cmhi) { + // 32-bit, 2 lane initialHeapData_.resize(32); - uint32_t* heap = reinterpret_cast(initialHeapData_.data()); - heap[0] = 42; - heap[1] = 7; - heap[2] = UINT32_MAX; - heap[3] = 7; - heap[4] = 1; - heap[5] = (1u << 31) - 1; - heap[6] = 0; - heap[7] = 7; + uint32_t* heapv2i32 = reinterpret_cast(initialHeapData_.data()); + heapv2i32[0] = UINT32_MAX; + heapv2i32[1] = 7; + + heapv2i32[2] = 1; + heapv2i32[3] = 7; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + cmhi v2.2s, v0.2s, v1.2s + cmhi v3.2s, v1.2s, v0.2s + )"); + CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0}); + CHECK_NEON(3, uint32_t, {0x0, 0x0}); + + // 32-bit, 4 lane + initialHeapData_.resize(32); + uint32_t* heapv4i32 = reinterpret_cast(initialHeapData_.data()); + heapv4i32[0] = 42; + heapv4i32[1] = 7; + heapv4i32[2] = UINT32_MAX; + heapv4i32[3] = 7; + heapv4i32[4] = 1; + heapv4i32[5] = (1u << 31) - 1; + heapv4i32[6] = 0; + heapv4i32[7] = 7; RUN_AARCH64(R"( # Get heap address @@ -804,6 +887,8 @@ TEST_P(InstNeon, cmhi) { )"); CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0}); CHECK_NEON(3, uint32_t, {0x0, 0xFFFFFFFF, 0x0, 0x0}); + + EXPECT_GROUP(R"(cmhi v3.4s, v1.4s, v0.4s)", VECTOR_SIMPLE_CMP); } TEST_P(InstNeon, cnt) { @@ -1011,6 +1096,29 @@ TEST_P(InstNeon, eor) { CHECK_NEON(3, 
uint8_t, {1, 3, 1, 7, 1, 3, 1, 15, 0, 0, 0, 0, 0, 0, 0, 0}); } +TEST_P(InstNeon, orn) { + initialHeapData_.resize(16); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 8; i++) { + heap[i] = i; + heap[i + 8] = i + 1; + } + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + + orn v2.8b, v0.8b, v1.8b + )"); + CHECK_NEON(2, uint8_t, {254, 253, 254, 251, 254, 253, 254, 247}); + + EXPECT_GROUP(R"(orn v2.8b, v0.8b, v1.8b)", VECTOR_SIMPLE_LOGICAL_NOSHIFT); +} + TEST_P(InstNeon, ext) { RUN_AARCH64(R"( movi v0.16b, #0xAB @@ -1531,11 +1639,12 @@ TEST_P(InstNeon, fcvtl2) { TEST_P(InstNeon, fdiv) { initialHeapData_.resize(32); - double* heap = reinterpret_cast(initialHeapData_.data()); - heap[0] = 1.0; - heap[1] = -42.5; - heap[2] = -0.125; - heap[3] = 16.0; + // 2 Doubles + double* heapv2f64 = reinterpret_cast(initialHeapData_.data()); + heapv2f64[0] = 1.0; + heapv2f64[1] = -42.5; + heapv2f64[2] = -0.125; + heapv2f64[3] = 16.0; RUN_AARCH64(R"( # Get heap address @@ -1548,6 +1657,29 @@ TEST_P(InstNeon, fdiv) { fdiv v2.2d, v0.2d, v1.2d )"); CHECK_NEON(2, double, {-8.0, -2.65625}); + + // 4 Floats + float* heapv4f32 = reinterpret_cast(initialHeapData_.data()); + heapv4f32[0] = 1.0f; + heapv4f32[1] = -42.5f; + heapv4f32[2] = 10.0f; + heapv4f32[3] = 0.0f; + heapv4f32[4] = -0.125f; + heapv4f32[5] = 16.0f; + heapv4f32[6] = -2.0f; + heapv4f32[7] = 256.0f; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + fdiv v2.4s, v0.4s, v1.4s + )"); + CHECK_NEON(2, float, {-8.0f, -2.65625f, -5.0f, 0.0f}); } TEST_P(InstNeon, fmla) { @@ -2746,6 +2878,97 @@ TEST_P(InstNeon, umaxp) { 0xCC, 0xBB, 0xAA, 0x99, 0x88}); } +TEST_P(InstNeon, umaxv) { + // umaxv vd, vn.t + initialHeapData_.resize(32); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + + // v0 + heap[0] = 0x01; + heap[1] = 0x00; + heap[2] = 0xFF; + heap[3] = 0xAA; + heap[4] = 
0xBB; + heap[5] = 0xCC; + heap[6] = 0xDD; + heap[7] = 0xEE; + heap[8] = 0x07; + heap[9] = 0x00; + heap[10] = 0xFC; + heap[11] = 0xFD; + heap[12] = 0xBA; + heap[13] = 0xCA; + heap[14] = 0x39; + heap[15] = 0xEF; + + // v1 + heap[16] = 0x00; + heap[17] = 0x00; + heap[18] = 0xEE; + heap[19] = 0x11; + heap[20] = 0x22; + heap[21] = 0x33; + heap[22] = 0x44; + heap[23] = 0x55; + heap[24] = 0x26; + heap[25] = 0xFF; + heap[26] = 0xEA; + heap[27] = 0xFA; + heap[28] = 0x14; + heap[29] = 0x43; + heap[30] = 0x21; + heap[31] = 0xAE; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + umaxv h2, v0.4h + umaxv h3, v1.4h + + umaxv h4, v0.8h + umaxv h5, v1.8h + + umaxv s6, v0.4s + umaxv s7, v1.4s + + umaxv b8, v0.8b + umaxv b9, v1.8b + + umaxv b10, v0.16b + umaxv b11, v1.16b + + )"); + CHECK_NEON(2, uint16_t, + {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(3, uint16_t, + {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(4, uint16_t, + {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(5, uint16_t, + {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(8, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(9, uint8_t, + {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(10, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(11, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + + EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); +} + TEST_P(InstNeon, smax) { 
initialHeapData_.resize(32); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b494..2b43e510e4 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -3,6 +3,7 @@ namespace { using InstStore = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstStore, stlr) { // stlrb @@ -60,6 +61,101 @@ TEST_P(InstStore, stlr) { 0xBABA); } +TEST_P(InstStore, stlxr) { + // stlxrb + RUN_AARCH64(R"( + mov w0, 0xAB + mov w1, 0x12 + mov w2, 0xCD + mov w3, 0x34 + sub sp, sp, #4 + stlxrb w4, w0, [sp] + add sp, sp, #1 + stlxrb w5, w1, [sp] + add sp, sp, #1 + stlxrb w6, w2, [sp] + add sp, sp, #1 + stlxrb w7, w3, [sp] + add sp, sp, #1 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 3), + 0x12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0xCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1), + 0x34); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + EXPECT_GROUP(R"(stlxrb w7, w3, [sp])", STORE_ADDRESS_INT); + + // stlxrh + RUN_AARCH64(R"( + mov w0, 0xABCD + mov w1, 0x1234 + mov w2, 0xCDEF + mov w3, 0x3456 + sub sp, sp, #8 + stlxrh w4, w0, [sp] + add sp, sp, #2 + stlxrh w5, w1, [sp] + add sp, sp, #2 + stlxrh w6, w2, [sp] + add sp, sp, #2 + stlxrh w7, w3, [sp] + add sp, sp, #2 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xABCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 6), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xCDEF); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0x3456); + 
EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + EXPECT_GROUP(R"(stlxrh w7, w3, [sp])", STORE_ADDRESS_INT); + + // stlxr + RUN_AARCH64(R"( + mov w0, 0xABCD + mov w1, 0x1234 + mov w2, 0xCDEF + mov w3, 0x3456 + sub sp, sp, #24 + stlxr w4, x0, [sp] + add sp, sp, #8 + stlxr w5, x1, [sp] + add sp, sp, #8 + stlxr w6, w2, [sp] + add sp, sp, #4 + stlxr w7, w3, [sp] + add sp, sp, #4 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 0xABCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xCDEF); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0x3456); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + EXPECT_GROUP(R"(stlxr w7, w3, [sp])", STORE_ADDRESS_INT); +} + TEST_P(InstStore, strb) { RUN_AARCH64(R"( mov w0, 0xAB diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b95..f94ee28262 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1322,6 +1322,194 @@ TEST_P(InstSve, cmphi_vec) { EXPECT_EQ(getNZCV(), 0b0110); } +TEST_P(InstSve, cmphs_vec) { + // 8-bit + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z0.b, z1.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.b, xzr, x0 + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z0.b, z1.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, 
p0/z, z1.b, z0.b + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #6 + dup z1.b, #6 + + cmphs p1.b, p0/z, z1.b, z0.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 16-bit + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z0.h, z1.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.h, xzr, x0 + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z0.h, z1.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z1.h, z0.h + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #6 + dup z1.h, #6 + + cmphs p1.h, p0/z, z1.h, z0.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z0.s, z1.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.s, xzr, x0 + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z0.s, z1.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z1.s, z0.s + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #6 + dup z1.s, #6 + + cmphs p1.s, p0/z, z1.s, z0.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); 
+ + // 64-bit + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z0.d, z1.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.d, xzr, x0 + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z0.d, z1.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z1.d, z0.d + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #6 + dup z1.d, #6 + + cmphs p1.d, p0/z, z1.d, z0.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(cmphs p1.d, p0/z, z1.d, z0.d)", PREDICATE); +} + TEST_P(InstSve, cnt) { // pattern = all RUN_AARCH64(R"( @@ -1544,6 +1732,146 @@ TEST_P(InstSve, cpy) { CHECK_NEON(4, int64_t, fillNeon({12}, VL / 8)); CHECK_NEON(5, int64_t, fillNeon({static_cast(-2048)}, VL / 16)); + + // SIMD & FP scalar + // Tests are different for 8/16 bit vs 32/64 bit due to the lack of fmov + // support for h and b registers + // 8-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.b + whilelo p1.b, xzr, x0 + + cpy z6.b, p0/z, #10 + cpy z7.b, p0/z, #-8 + cpy z8.b, p0/z, #12 + cpy z9.b, p0/z, #-16 + cpy z10.b, p0/z, #12 + cpy z11.b, p0/z, #-8 + + cpy z0.b, p0/m, b6 + cpy z1.b, p0/m, b7 + cpy z2.b, p1/m, b8 + cpy z3.b, p1/m, b9 + + # Test Alias + mov z4.b, p0/m, b10 + mov z5.b, p1/m, b11 + )"); + CHECK_NEON(0, int8_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(2, int8_t, fillNeon({12}, VL / 16)); + CHECK_NEON(3, int8_t, fillNeon({-16}, VL / 16)); + CHECK_NEON(4, int8_t, fillNeon({12}, VL / 8)); + CHECK_NEON(5, int8_t, fillNeon({-8}, VL / 16)); + + // 
16-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.h + whilelo p1.h, xzr, x0 + + cpy z6.h, p0/z, #10 + cpy z7.h, p0/z, #8, lsl #8 + cpy z8.h, p0/z, #-12 + cpy z9.h, p0/z, #-16, lsl #8 + cpy z10.h, p0/z, #12 + cpy z11.h, p0/z, #-8, lsl #8 + + cpy z0.h, p0/m, h6 + cpy z1.h, p0/m, h7 + cpy z2.h, p1/m, h8 + cpy z3.h, p1/m, h9 + + # Test Alias + mov z4.h, p0/m, h10 + mov z5.h, p1/m, h11 + )"); + CHECK_NEON(0, int16_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int16_t, + fillNeon({static_cast(2048)}, VL / 8)); + CHECK_NEON(2, int16_t, fillNeon({-12}, VL / 16)); + CHECK_NEON(3, int16_t, + fillNeon({static_cast(-4096)}, VL / 16)); + CHECK_NEON(4, int16_t, fillNeon({12}, VL / 8)); + CHECK_NEON(5, int16_t, + fillNeon({static_cast(-2048)}, VL / 16)); + + // 32-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.s + whilelo p1.s, xzr, x0 + + fmov s6, #10 + fmov s7, #-8 + fmov s8, #12 + fmov s9, #-16 + fmov s10, #12 + fmov s11, #-8 + + cpy z0.s, p0/m, s6 + cpy z1.s, p0/m, s7 + cpy z2.s, p1/m, s8 + cpy z3.s, p1/m, s9 + + # Test Alias + mov z4.S, p0/m, s10 + mov z5.S, p1/m, s11 + )"); + CHECK_NEON(0, float, fillNeon({10}, VL / 8)); + CHECK_NEON(1, float, fillNeon({static_cast(-8)}, VL / 8)); + CHECK_NEON(2, float, fillNeon({12}, VL / 16)); + CHECK_NEON(3, float, fillNeon({static_cast(-16)}, VL / 16)); + CHECK_NEON(4, float, fillNeon({12}, VL / 8)); + CHECK_NEON(5, float, fillNeon({static_cast(-8)}, VL / 16)); + + // 64-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.d + whilelo p1.d, xzr, x0 + + fmov d6, #10 + fmov d7, #-8 + fmov d8, #12 + fmov d9, #-16 + fmov d10, #12 + fmov d11, #-8 + + cpy z0.d, p0/m, d6 + cpy z1.d, p0/m, d7 + cpy z2.d, p1/m, d8 + cpy z3.d, p1/m, d9 + + # Test Alias + mov z4.d, p0/m, d10 + mov z5.d, p1/m, d11 + )"); + CHECK_NEON(0, double, fillNeon({10}, VL / 8)); + CHECK_NEON(1, double, fillNeon({static_cast(-8)}, 
VL / 8)); + CHECK_NEON(2, double, fillNeon({12}, VL / 16)); + CHECK_NEON(3, double, fillNeon({static_cast(-16)}, VL / 16)); + CHECK_NEON(4, double, fillNeon({12}, VL / 8)); + CHECK_NEON(5, double, fillNeon({static_cast(-8)}, VL / 16)); + + EXPECT_GROUP(R"(cpy z3.d, p1/m, d9)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, fcpy) { @@ -3451,6 +3779,8 @@ TEST_P(InstSve, fdiv) { CHECK_NEON(1, double, fillNeon(dresults, VL / 8)); std::rotate(dsrcB.begin(), dsrcB.begin() + ((VL / 128) % 8), dsrcB.end()); CHECK_NEON(2, double, fillNeonCombined(dresults, dsrcB, VL / 8)); + + EXPECT_GROUP(R"(fdiv z2.d, p0/m, z2.d, z0.d)", SVE_DIV_OR_SQRT); } TEST_P(InstSve, fnmls) { @@ -4610,10 +4940,17 @@ TEST_P(InstSve, index) { CHECK_NEON(7, uint64_t, fillNeonBaseAndOffset(10, 10, VL / 8)); } -TEST_P(InstSve, ld1rd) { - initialHeapData_.resize(16); - uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); - fillHeap(heap64, {0xDEADBEEF, 0x12345678}, 2); +TEST_P(InstSve, ftsmul) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1.0, 2.0, 4.0, 12.34}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". 
+ // We use doubles anyway as we only care about the sign bit, and currently + // "fillHeapCombined" only takes a single templated type + std::vector srcB64 = {1.0, -5.4, 0.0, 78.2}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); RUN_AARCH64(R"( # Get heap address @@ -4621,24 +4958,251 @@ TEST_P(InstSve, ld1rd) { mov x8, 214 svc #0 - # Load and broadcast values from heap + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 ptrue p0.d - ld1rd {z0.d}, p0/z, [x0] - ld1rd {z1.d}, p0/z, [x0, #8] - # Test for inactive lanes - mov x1, #0 - addvl x1, x1, #1 - mov x2, #16 - udiv x1, x1, x2 - whilelo p1.d, xzr, x1 - ld1rd {z2.d}, p1/z, [x0] - ld1rd {z3.d}, p1/z, [x0, #8] + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + + ftsmul z2.d, z0.d, z1.d + ftsmul z3.d, z1.d, z0.d )"); - CHECK_NEON(0, uint64_t, fillNeon({0xDEADBEEF}, VL / 8)); - CHECK_NEON(1, uint64_t, fillNeon({0x12345678}, VL / 8)); - CHECK_NEON(2, uint64_t, fillNeon({0xDEADBEEF}, VL / 16)); - CHECK_NEON(3, uint64_t, fillNeon({0x12345678}, VL / 16)); + CHECK_NEON(2, double, fillNeon({1.0, -4.0, 16.0, 152.2756}, VL / 8)); + CHECK_NEON(3, double, fillNeon({1.0, 29.16, 0.0, 6115.24}, VL / 8)); + + // 32-bit arrangement + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {1.0f, 2.0f, 4.0f, 12.34f, + -3.0f, -19.6f, 0.0f, 7.0f}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". 
+ // We use floats anyway as we only care about the sign bit, and currently + // "fillHeapCombined" only takes a single templated type + std::vector fsrcB = {1.0f, -5.4f, 0.0f, 78.2f, + 2.1f, -26.42f, 12.0f, 3.5f}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + ptrue p1.s + + ld1w {z0.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftsmul z2.s, z0.s, z1.s + ftsmul z3.s, z1.s, z0.s + )"); + CHECK_NEON(2, float, + fillNeon( + {1.0f, -4.0f, 16.0f, 152.2756f, 9.0f, -384.16f, 0.0f, 49.0f}, + VL / 16)); + CHECK_NEON(3, float, + fillNeon({1.0f, 29.16f, 0.0f, 6115.24f, -4.41f, -698.0164f, + 144.0f, 12.25f}, + VL / 16)); + + EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_SIMPLE_ARTH_NOSHIFT); +} + +TEST_P(InstSve, ftssel) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + // We use uint64_t to model doubles here as we care about the bit patterns + // rather than values + uint64_t* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {0x1234, 0xABCD, 0x00000000F0F0FFFF, 0x9876}; + // Note that "The use of the second operand is consistent with it holding an + // integer corresponding to the desired sine-wave quadrant." 
+ std::vector srcB64 = {0x0, 0x8000000000000000, 0x4000000000000000, + 0xC000000000000000}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + + ftssel z2.d, z0.d, z1.d + )"); + CHECK_NEON(2, uint64_t, + fillNeon({0x1234, 0x3ff0000000000000, 0x80000000F0F0FFFF, + 0xbff0000000000000}, + VL / 8)); + + // 32-bit arrangement + // We use uint32_t to model floats here as we care about the bit patterns + // rather than values + initialHeapData_.resize(VL / 8); + uint32_t* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {0x1234, 0xABCD, 0x00F0FFFF, 0x9876}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". + std::vector fsrcB = {0x0, 0x80000000, 0x40000000, 0xC0000000}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + ptrue p1.s + + ld1w {z0.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftssel z2.s, z0.s, z1.s + )"); + CHECK_NEON(2, uint32_t, + fillNeon({0x1234, 0x3f800000, 0x80F0FFFF, 0xBF800000}, + VL / 16)); + + EXPECT_GROUP(R"(ftssel z2.s, z0.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT); +} + +TEST_P(InstSve, ftmad) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {0.0, 0.5, -0.5, 0.75}; + std::vector srcB64 = {0.0, 0.5, -0.4, -0.2}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl 
x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + mov z2.d, z0.d + mov z3.d, z0.d + mov z4.d, z0.d + + ftmad z2.d, z2.d, z1.d, #0 + ftmad z3.d, z3.d, z1.d, #2 + ftmad z4.d, z4.d, z1.d, #7 + )"); + CHECK_NEON(2, double, fillNeon({1.0, 1.25, 0.8, 1.15}, VL / 8)); + CHECK_NEON(3, double, + fillNeon({0.008333333333320002, 0.258333333333320002, + -0.15833333333333355, 0.19166666666666645}, + VL / 8)); + CHECK_NEON( + 4, double, + fillNeon({0.0, 0.25, -0.20000000001135337, 0.1499999999886466}, + VL / 8)); + + // 32-bit arrangement + initialHeapData_.resize(VL / 4); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {0.0f, 0.5f, -0.5f, 0.75f}; + std::vector fsrcB = {0.0f, 0.5f, -0.4f, -0.2f}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #4 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + + ld1w {z2.s}, p0/z, [x0] + ld1w {z3.s}, p0/z, [x0] + ld1w {z4.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftmad z2.s, z2.s, z1.s, #0 + ftmad z3.s, z3.s, z1.s, #2 + ftmad z4.s, z4.s, z1.s, #7 + )"); + CHECK_NEON(2, float, fillNeon({1.0f, 1.25f, 0.8f, 1.15f}, VL / 8)); + CHECK_NEON(3, float, + fillNeon( + {0.00833333f, 0.25833333f, -0.1583334f, 0.1916666f}, VL / 8)); + CHECK_NEON(4, float, fillNeon({0.0f, 0.25f, -0.2f, 0.15f}, VL / 8)); + + EXPECT_GROUP(R"(ftmad z4.s, z4.s, z1.s, #7)", SVE_MUL); +} + +TEST_P(InstSve, ld1rd) { + initialHeapData_.resize(16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, {0xDEADBEEF, 0x12345678}, 2); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.d + ld1rd {z0.d}, p0/z, [x0] + ld1rd {z1.d}, p0/z, [x0, #8] + + # Test for inactive lanes + mov x1, #0 + addvl x1, x1, #1 + mov x2, #16 + 
udiv x1, x1, x2 + whilelo p1.d, xzr, x1 + ld1rd {z2.d}, p1/z, [x0] + ld1rd {z3.d}, p1/z, [x0, #8] + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xDEADBEEF}, VL / 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x12345678}, VL / 8)); + CHECK_NEON(2, uint64_t, fillNeon({0xDEADBEEF}, VL / 16)); + CHECK_NEON(3, uint64_t, fillNeon({0x12345678}, VL / 16)); } TEST_P(InstSve, ld1rqd) { @@ -5647,6 +6211,29 @@ TEST_P(InstSve, pfalse) { CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {0}, 1)); } +TEST_P(InstSve, pfirst) { + RUN_AARCH64(R"( + ptrue p0.b + pfalse p1.b + ptrue p2.b + ptrue p3.b + pfalse p4.b + pfalse p5.b + + pfirst p2.b, p0, p2.b + pfirst p3.b, p1, p3.b + pfirst p4.b, p0, p4.b + pfirst p5.b, p1, p5.b + )"); + CHECK_PREDICATE(2, uint64_t, fillPred(VL / 8, {1}, 1)); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); + CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); + CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); + EXPECT_EQ(getNZCV(), 0b0110); + + EXPECT_GROUP(R"(pfirst p5.b, p1, p5.b)", PREDICATE); +} + TEST_P(InstSve, ptrue) { RUN_AARCH64(R"( ptrue p0.s @@ -5660,6 +6247,108 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, pnext) { + initialHeapData_.resize(1024); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + + // B arrangement + // Allow 32 Byte space for each predicate register for when VL=2048 + std::vector src = {0xAAAA, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xAA00, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 12); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p2, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.b, p2, p0.b + + ldr p1, [x0] + add x0, x0, #32 + ldr p3, [x0] + + pnext p1.b, p3, p1.b + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x02, 0, 0, 0}, 32)); + CHECK_PREDICATE(1, uint64_t, + fillPredFromSource({0x0200, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b0010); + + // H arrangement + src = {0x555, 0x0, 0x0, 0x0, 0x333, 
0x0, 0x0, 0x0}; + fillHeap(heap64, src, 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p1, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.h, p1, p0.h + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x400, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b0010); + + // S arrangement + src = {0x9, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p1, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.s, p1, p0.s + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x1, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b1010); + + // D arrangement + src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, + 0xF3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 12); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p2, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.d, p2, p0.d + + add x0, x0, #32 + ldr p3, [x0] + add x0, x0, #32 + ldr p1, [x0] + + pnext p1.d, p3, p1.d + )"); + CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0, 0, 0, 0}, 32)); + CHECK_PREDICATE(1, uint64_t, + fillPredFromSource({0x1, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b1010); + + EXPECT_GROUP(R"(pnext p1.d, p3, p1.d)", PREDICATE); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b @@ -5965,6 +6654,64 @@ TEST_P(InstSve, sel) { } TEST_P(InstSve, smax) { + // 64-bit + initialHeapData_.resize(VL / 4); + int64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB64 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap64, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #8 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.d, xzr, x3 + ptrue p0.d + + 
ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + ld1d {z2.d}, p0/z, [x0, x2, lsl #3] + ld1d {z3.d}, p0/z, [x0, x1, lsl #3] + ld1d {z4.d}, p0/z, [x0, x1, lsl #3] + ld1d {z5.d}, p0/z, [x0, x1, lsl #3] + + smax z1.d, p0/m, z1.d, z0.d + smax z2.d, p1/m, z2.d, z0.d + + smax z3.d, z3.d, #0 + smax z4.d, z4.d, #-128 + smax z5.d, z5.d, #127 + )"); + std::vector results64 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int64_t, fillNeon(results64, VL / 8)); + std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end()); + CHECK_NEON(2, int64_t, fillNeonCombined(results64, srcB64, VL / 8)); + + CHECK_NEON(3, int64_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int64_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int64_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127}, + VL / 8)); + // 32-bit initialHeapData_.resize(VL / 4); int32_t* heap32 = reinterpret_cast(initialHeapData_.data()); @@ -6022,9 +6769,184 @@ TEST_P(InstSve, smax) { fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}, VL / 8)); + + // 16-bit + initialHeapData_.resize(VL / 4); + int16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA16 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB16 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap16, srcA16, srcB16, VL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #2 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.h, xzr, x3 + ptrue p0.h + + ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z1.h}, p0/z, [x0, x2, lsl #1] + ld1h {z2.h}, p0/z, [x0, x2, lsl #1] + ld1h {z3.h}, p0/z, [x0, x1, lsl #1] + ld1h 
{z4.h}, p0/z, [x0, x1, lsl #1] + ld1h {z5.h}, p0/z, [x0, x1, lsl #1] + + smax z1.h, p0/m, z1.h, z0.h + smax z2.h, p1/m, z2.h, z0.h + + smax z3.h, z3.h, #0 + smax z4.h, z4.h, #-128 + smax z5.h, z5.h, #127 + )"); + std::vector results16 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int16_t, fillNeon(results16, VL / 8)); + std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end()); + CHECK_NEON(2, int16_t, fillNeonCombined(results16, srcB16, VL / 8)); + + CHECK_NEON(3, int16_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int16_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int16_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127}, + VL / 8)); + + // 8-bit + initialHeapData_.resize(VL / 4); + int8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA8 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB8 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap8, srcA8, srcB8, VL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #1 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.b, xzr, x3 + ptrue p0.b + + ld1b {z0.b}, p0/z, [x0, x1] + ld1b {z1.b}, p0/z, [x0, x2] + ld1b {z2.b}, p0/z, [x0, x2] + ld1b {z3.b}, p0/z, [x0, x1] + ld1b {z4.b}, p0/z, [x0, x1] + ld1b {z5.b}, p0/z, [x0, x1] + + smax z1.b, p0/m, z1.b, z0.b + smax z2.b, p1/m, z2.b, z0.b + + smax z3.b, z3.b, #0 + smax z4.b, z4.b, #-128 + smax z5.b, z5.b, #127 + )"); + std::vector results8 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int8_t, fillNeon(results8, VL / 8)); + std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end()); + CHECK_NEON(2, int8_t, 
fillNeonCombined(results8, srcB8, VL / 8)); + + CHECK_NEON(3, int8_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int8_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int8_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127}, + VL / 8)); + + EXPECT_GROUP(R"(smax z5.b, z5.b, #127)", SVE_SIMPLE_ARTH_NOSHIFT); } -TEST_P(InstSve, smin) { +TEST_P(InstSve, smin) { + // 64-bit + initialHeapData_.resize(VL / 4); + int64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB64 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap64, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #8 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.d, xzr, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + ld1d {z2.d}, p0/z, [x0, x2, lsl #3] + + smin z1.d, p0/m, z1.d, z0.d + smin z2.d, p1/m, z2.d, z0.d + + sminv d3, p1, z1.d + sminv d4, p0, z2.d + )"); + + std::vector results64 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA64 = fillNeon(results64, VL / 8); + std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end()); + std::array arrB64 = + fillNeonCombined(results64, srcB64, VL / 8); + + CHECK_NEON(1, int64_t, arrA64); + CHECK_NEON(2, int64_t, arrB64); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. 
+ int64_t minElemA64 = arrA64[std::distance( + arrA64.begin(), + std::min_element(arrA64.begin(), arrA64.end() - (32 - VL / 128)))]; + int64_t minElemB64 = arrB64[std::distance( + arrB64.begin(), + std::min_element(arrB64.begin(), arrB64.end() - (32 - VL / 64)))]; + CHECK_NEON(3, int64_t, {minElemA64, 0, 0, 0}); + CHECK_NEON(4, int64_t, {minElemB64, 0, 0, 0}); + // 32-bit initialHeapData_.resize(VL / 4); int32_t* heap32 = reinterpret_cast(initialHeapData_.data()); @@ -6064,23 +6986,140 @@ TEST_P(InstSve, smin) { std::vector results32 = {1, 2, 3, 4, -12, -11, -10, -9, -9, -10, -11, -12, 4, 3, -15, -1}; - std::array arrA = fillNeon(results32, VL / 8); + std::array arrA32 = fillNeon(results32, VL / 8); std::rotate(srcB32.begin(), srcB32.begin() + ((VL / 64) % 16), srcB32.end()); - std::array arrB = + std::array arrB32 = fillNeonCombined(results32, srcB32, VL / 8); - CHECK_NEON(1, int32_t, arrA); - CHECK_NEON(2, int32_t, arrB); + CHECK_NEON(1, int32_t, arrA32); + CHECK_NEON(2, int32_t, arrB32); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. 
+ int32_t minElemA32 = arrA32[std::distance( + arrA32.begin(), + std::min_element(arrA32.begin(), arrA32.end() - (64 - VL / 64)))]; + int32_t minElemB32 = arrB32[std::distance( + arrB32.begin(), + std::min_element(arrB32.begin(), arrB32.end() - (64 - VL / 32)))]; + CHECK_NEON(3, int32_t, {minElemA32, 0, 0, 0}); + CHECK_NEON(4, int32_t, {minElemB32, 0, 0, 0}); + + // 16-bit + initialHeapData_.resize(VL / 4); + int16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA16 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB16 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap16, srcA16, srcB16, VL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #2 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.h, xzr, x3 + ptrue p0.h + + ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z1.h}, p0/z, [x0, x2, lsl #1] + ld1h {z2.h}, p0/z, [x0, x2, lsl #1] + + smin z1.h, p0/m, z1.h, z0.h + smin z2.h, p1/m, z2.h, z0.h + + sminv h3, p1, z1.h + sminv h4, p0, z2.h + )"); + + std::vector results16 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA16 = fillNeon(results16, VL / 8); + std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end()); + std::array arrB16 = + fillNeonCombined(results16, srcB16, VL / 8); + + CHECK_NEON(1, int16_t, arrA16); + CHECK_NEON(2, int16_t, arrB16); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. 
+ int16_t minElemA16 = arrA16[std::distance( + arrA16.begin(), + std::min_element(arrA16.begin(), arrA16.end() - (128 - VL / 32)))]; + int16_t minElemB16 = arrB16[std::distance( + arrB16.begin(), + std::min_element(arrB16.begin(), arrB16.end() - (128 - VL / 16)))]; + CHECK_NEON(3, int16_t, {minElemA16, 0, 0, 0}); + CHECK_NEON(4, int16_t, {minElemB16, 0, 0, 0}); + + // 8-bit + initialHeapData_.resize(VL / 4); + int8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA8 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB8 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap8, srcA8, srcB8, VL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #1 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.b, xzr, x3 + ptrue p0.b + + ld1b {z0.b}, p0/z, [x0, x1] + ld1b {z1.b}, p0/z, [x0, x2] + ld1b {z2.b}, p0/z, [x0, x2] + + smin z1.b, p0/m, z1.b, z0.b + smin z2.b, p1/m, z2.b, z0.b + + sminv b3, p1, z1.b + sminv b4, p0, z2.b + )"); + + std::vector results8 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA8 = fillNeon(results8, VL / 8); + std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end()); + std::array arrB8 = + fillNeonCombined(results8, srcB8, VL / 8); + + CHECK_NEON(1, int8_t, arrA8); + CHECK_NEON(2, int8_t, arrB8); // Find miniumum element. Modify search end point to only consider the // elements within the current VL and predication. 
- int32_t minElemA = arrA[std::distance( - arrA.begin(), - std::min_element(arrA.begin(), arrA.end() - (64 - VL / 64)))]; - int32_t minElemB = arrB[std::distance( - arrB.begin(), - std::min_element(arrB.begin(), arrB.end() - (64 - VL / 32)))]; - CHECK_NEON(3, int32_t, {minElemA, 0, 0, 0}); - CHECK_NEON(4, int32_t, {minElemB, 0, 0, 0}); + int8_t minElemA8 = arrA8[std::distance( + arrA8.begin(), + std::min_element(arrA8.begin(), arrA8.end() - (256 - VL / 16)))]; + int8_t minElemB8 = arrB8[std::distance( + arrB8.begin(), + std::min_element(arrB8.begin(), arrB8.end() - (256 - VL / 8)))]; + CHECK_NEON(3, int8_t, {minElemA8, 0, 0, 0}); + CHECK_NEON(4, int8_t, {minElemB8, 0, 0, 0}); + + EXPECT_GROUP(R"(smin z2.b, p1/m, z2.b, z0.b)", SVE_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(sminv b4, p0, z2.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, smulh) { @@ -6164,6 +7203,260 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } +TEST_P(InstSve, clastb) { + // 64 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb d2, p0, d2, z3.d + mov z0.d, z2.d + + ptrue p0.d + clastb d2, p0, d2, z3.d + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123456789ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); + + // 32 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb s2, p0, s2, z3.s + mov z0.d, z2.d + + ptrue p0.s + clastb s2, p0, s2, z3.s + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x89ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA9}, 
8)); + + // 16 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb h2, p0, h2, z3.h + mov z0.d, z2.d + + ptrue p0.h + clastb h2, p0, h2, z3.h + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FED}, 8)); + + // 8 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb b2, p0, b2, z3.b + mov z0.d, z2.d + + ptrue p0.b + clastb b2, p0, b2, z3.b + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); + + EXPECT_GROUP(R"(clastb b2, p0, b2, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); +} + +TEST_P(InstSve, lastb) { + // 64 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + ptrue p0.d + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb d4, p0, z2.d + mov z0.d, z4.d + + ptrue p0.d + lastb d5, p0, z3.d + mov z1.d, z5.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123456789ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); + + // 32 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb s4, p0, z2.s + mov z0.d, z4.d + + ptrue p0.s + lastb s4, p0, z3.s + mov z1.d, z4.d + )"); 
+ CHECK_NEON(0, uint64_t, fillNeon({0x01234567}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA9}, 8)); + + // 16 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb h4, p0, z2.h + mov z0.d, z4.d + + ptrue p0.h + lastb h4, p0, z3.h + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FED}, 8)); + + // 8 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb b4, p0, z2.b + mov z0.d, z4.d + + ptrue p0.b + lastb b4, p0, z3.b + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x01}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); + + EXPECT_GROUP(R"(lastb b4, p0, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); +} + +TEST_P(InstSve, splice) { + // 64-bit arrangement + RUN_AARCH64(R"( + fmov z0.d, #1.5 + fmov z1.d, #-0.5 + fmov z2.d, #1.5 + + ptrue p0.d + + mov x2, #0 + mov x4, #16 + addvl x2, x2, #1 + udiv x2, x2, x4 + whilelo p1.d, xzr, x2 + + splice z0.d, p0, z0.d, z1.d + splice z2.d, p1, z2.d, z1.d + )"); + CHECK_NEON(0, double, fillNeon({1.5}, VL / 8)); + CHECK_NEON(2, double, fillNeonCombined({1.5}, {-0.5}, VL / 8)); + + // 32-bit arrangement + RUN_AARCH64(R"( + fmov z0.s, #1.5 + fmov z1.s, #-0.5 + fmov z2.s, #1.5 + + ptrue p0.s + + mov x2, #0 + mov x4, #8 + addvl x2, x2, #1 + udiv x2, x2, x4 + whilelo p1.s, xzr, x2 + + splice z0.s, p0, z0.s, z1.s + splice z2.s, p1, z2.s, z1.s + )"); + CHECK_NEON(0, float, fillNeon({1.5}, VL / 8)); + CHECK_NEON(2, float, fillNeonCombined({1.5}, {-0.5}, VL / 8)); + + EXPECT_GROUP(R"(splice z2.s, p1, z2.s, z1.s)", 
SVE_SIMPLE_ARTH_NOSHIFT); +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); @@ -7704,6 +8997,202 @@ TEST_P(InstSve, whilelo) { EXPECT_EQ(getNZCV(), 0b0110); } +TEST_P(InstSve, whilels) { + // 8-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + + whilels p0.b, xzr, x0 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + + whilels p1.b, x2, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + mov x3, #4 + udiv x4, x0, x3 + add x5, x4, x2 + + whilels p2.b, x5, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + sub x0, x0, #1 + mov x1, #0 + + whilels p3.b, x1, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 16-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + + whilels p0.h, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x0, x0, x1 + udiv x2, x0, x1 + + whilels p1.h, x2, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x2, x0, x1 + mov x3, #8 + udiv x4, x0, x3 + mov x5, #2 + udiv x0, x0, x5 + add x6, x4, x2 + + whilels p2.h, x6, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + 
udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.h, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 32-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x2, x0, x1 + + whilels p0.s, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + mov x2, #4 + udiv x0, x0, x2 + udiv x3, x0, x1 + + whilels p1.s, x3, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x2, x0, x1 + mov x3, #16 + udiv x4, x0, x3 + mov x5, #4 + udiv x0, x0, x5 + add x6, x4, x2 + + whilels p2.s, x6, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.s, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 64-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x2, x0, x1 + + whilels p0.d, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + mov x2, #8 + udiv x0, x0, x2 + udiv x3, x0, x1 + + whilels p1.d, x3, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 8)); + if (VL == 128) { + EXPECT_EQ(getNZCV(), 0b1000); + } else { + EXPECT_EQ(getNZCV(), 0b1010); + } + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.d, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(whilels p3.d, xzr, x0)", 
PREDICATE); +} + TEST_P(InstSve, whilelt) { // 8-bit arrangement, 64-bit source operands RUN_AARCH64(R"(