From 9a9ca3fee3e268e03f64ccd1760850aabed0ba08 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Wed, 1 May 2024 18:55:08 +0100 Subject: [PATCH 01/38] Added LDRSWroW, LDAXRB, stlxrb insts --- src/lib/arch/aarch64/Instruction_address.cc | 15 +++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 12 ++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..3f27b5acc3 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -451,6 +451,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } + case Opcode::AArch64_LDAXRB: { // ldaxrb wt, [xn] + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] setMemoryAddresses({{sourceValues_[0].get(), 4}}); break; @@ -749,6 +753,13 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); break; } + case Opcode::AArch64_LDRSWroW: { // ldrsw xt, [xn, wm{, extend + // {#amount}}] + uint64_t offset = extendOffset(sourceValues_[1].get(), + metadata_.operands[1]); + setMemoryAddresses({{sourceValues_[0].get() + offset, 4}}); + break; + } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] uint64_t base = sourceValues_[0].get() + metadata_.operands[1].mem.disp; @@ -1350,6 +1361,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } + case Opcode::AArch64_STLXRB: { // stlxrb ws, wt, [xn] + setMemoryAddresses({{sourceValues_[1].get(), 1}}); + break; + } case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..b8352c79bf 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3330,6 +3330,11 @@ void Instruction::execute() { results_[0] = memoryData_[0]; break; } + case Opcode::AArch64_LDAXRB: { // ldaxrb wt, [xn] + // LOAD + results_[0] = memoryData_[0].zeroExtend(1, 8); + break; + } case Opcode::AArch64_LDAXRW: { // ldaxr wd, [xn] // LOAD results_[0] = memoryData_[0].zeroExtend(4, 8); @@ -3603,6 +3608,12 @@ void Instruction::execute() { results_[0] = static_cast(memoryData_[0].get()); break; } + case Opcode::AArch64_LDRSWroW: { // ldrsw xt, [xn, wm, {extend + // {#amount}}] + // LOAD + results_[0] = static_cast(memoryData_[0].get()); + break; + } case Opcode::AArch64_LDRSWui: { // ldrsw xt, [xn{, #pimm}] // LOAD results_[0] = static_cast(memoryData_[0].get()); @@ -4945,6 +4956,7 @@ void Instruction::execute() { memoryData_[0] = sourceValues_[0]; break; } + case Opcode::AArch64_STLXRB: // stlxrb ws, wt, [xn] case Opcode::AArch64_STLXRW: // stlxr ws, wt, [xn] case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] // STORE From 9adaeee95dcea32951e9b1f4a4969fb8fe266379 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Wed, 1 May 2024 19:01:09 +0100 Subject: [PATCH 02/38] Magic OMP affinity fix (thanks Jack) --- src/lib/arch/aarch64/ExceptionHandler.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1a..33701b049b 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -429,15 +429,16 @@ bool 
ExceptionHandler::init() { << std::endl; return fatal(); } - uint64_t retval = (pid == 0) ? 1 : 0; - stateChange = {ChangeType::REPLACEMENT, {R0}, {retval}}; - stateChange.memoryAddresses.push_back({mask, 1}); + uint64_t retval = static_cast(bitmask); + stateChange = {ChangeType::REPLACEMENT, {R0}, {sizeof(retval)}}; + stateChange.memoryAddresses.push_back({mask, 8}); stateChange.memoryAddressValues.push_back(bitmask); } else { stateChange = {ChangeType::REPLACEMENT, {R0}, {-1ll}}; } break; } + case 131: { // tgkill // TODO: Functionality temporarily omitted since simeng only has a // single thread at the moment From 70f0387cae35aa3f15dbb185135097c3237e09f4 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 20 May 2024 12:07:22 +0100 Subject: [PATCH 03/38] Added Cpy (Simd&FP scalar) instruction and alias, with tests for each size --- .../simeng/arch/aarch64/helpers/sve.hh | 26 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 +++ test/regression/aarch64/instructions/sve.cc | 117 ++++++++++++++++++ 3 files changed, 159 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe6..4c162bcddf 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -257,6 +257,32 @@ RegisterValue sveCpy_imm( return {out, 256}; } +/** Helper function for SVE instructions with the format `cpy zd, pg/m, vn + * T represents the type of sourceValues (e.g. for zd.d, T = int64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveCpy_Scalar( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* zd = sourceValues[0].getAsVector(); + const uint64_t* p = sourceValues[1].getAsVector(); + const T vn = sourceValues[2].get(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i] = vn; + } else { + out[i] = zd[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `dec xdn{, * pattern{, MUL #imm}}`. * T represents the type of operation (e.g. for DECD, T = uint64_t). 
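// Illustrative sketch only (not SimEng code): the sveCpy_Scalar helper added
// above relies on SimEng's predicate layout, where the element at index i of
// width sizeof(T) bytes is governed by bit ((i % (64 / sizeof(T))) * sizeof(T))
// of predicate word i / (64 / sizeof(T)).  The standalone function below shows
// the same merge behaviour for `cpy zd, pg/m, vn`; all names are hypothetical.
#include <cstdint>
#include <cstdio>

template <typename T>
void cpyScalarMerge(const T* zd_in, const uint64_t* pg, T vn, T* zd_out,
                    unsigned vlBits) {
  const unsigned elems = vlBits / (sizeof(T) * 8);
  for (unsigned i = 0; i < elems; i++) {
    const uint64_t bit = 1ull << ((i % (64 / sizeof(T))) * sizeof(T));
    // Active lanes take the scalar source; inactive lanes keep zd's old value.
    zd_out[i] = (pg[i / (64 / sizeof(T))] & bit) ? vn : zd_in[i];
  }
}

int main() {
  // VL = 128 bits with 64-bit elements gives two lanes; the predicate below
  // activates lane 0 only, so only the first element is replaced by 99.
  uint64_t zd[2] = {1, 2}, out[2];
  uint64_t pg[4] = {0x1, 0, 0, 0};
  cpyScalarMerge<uint64_t>(zd, pg, 99, out, 128);
  std::printf("%llu %llu\n", (unsigned long long)out[0],
              (unsigned long long)out[1]);  // prints: 99 2
  return 0;
}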
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index b8352c79bf..a7f8da3e86 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -949,6 +949,22 @@ void Instruction::execute() { results_[0] = sveCpy_imm(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_CPY_ZPmV_B: { // cpy zd.b, pg/m, vn.b + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_D: { // cpy zd.d, pg/m, vn.d + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_H: { // cpy zd.h, pg/m, vn.h + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_CPY_ZPmV_S: { // cpy zd.s, pg/m, vn.s + results_[0] = sveCpy_Scalar(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_DUPi32: { // dup vd, vn.s[index] results_[0] = vecDup_gprOrIndex(sourceValues_, metadata_, false); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b95..e75b1c2061 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1544,6 +1544,123 @@ TEST_P(InstSve, cpy) { CHECK_NEON(4, int64_t, fillNeon({12}, VL / 8)); CHECK_NEON(5, int64_t, fillNeon({static_cast(-2048)}, VL / 16)); + + // SIMD & FP scalar + // 8-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.b + whilelo p1.b, xzr, x0 + + dup z1.b, #10 + dup z2.b, #-8 + + cpy z0.b, p0/m, z1.b + cpy z1.b, p0/m, z2.b + cpy z2.b, p1/m, z1.b + cpy z3.b, p1/m, z2.b + + # Test Alias + mov z4.b, p0/m, z1.b + mov z5.b, p1/m, z2.b + )"); + CHECK_NEON(0, int8_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(2, int8_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int8_t, fillNeon({-8}, VL / 16)); + CHECK_NEON(4, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(5, int8_t, fillNeon({-8}, VL / 16)); + + // 16-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.h + whilelo p1.h, xzr, x0 + + dup z1.h, #10 + dup z2.h, #2048 + + cpy z0.h, p0/m, z1.h + cpy z1.h, p0/m, z2.h + cpy z2.h, p1/m, z1.h + cpy z3.h, p1/m, z2.h + + # Test Alias + mov z4.h, p0/m, z1.h + mov z5.h, p1/m, z2.h + )"); + CHECK_NEON(0, int16_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int16_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(2, int16_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int16_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(4, int16_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(5, int16_t, fillNeon({2048}, VL / 16)); + + // 32-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.s + whilelo p1.s, xzr, x0 + + dup z1.s, #10 + dup z2.s, #2048 + + cpy z0.s, p0/m, z1.s + cpy z1.s, p0/m, z2.s + cpy z2.s, p1/m, z1.s + cpy z3.s, p1/m, z2.s + + # Test Alias + mov z4.s, p0/m, z1.s + mov z5.s, p1/m, z2.s + )"); + CHECK_NEON(0, int32_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int32_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(2, int32_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int32_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(4, int32_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(5, int32_t, fillNeon({2048}, VL / 16)); + + // 64-bit + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.d + whilelo p1.d, xzr, x0 + + dup z1.d, #10 + dup z2.d, #2048 + + 
cpy z0.d, p0/m, z1.d + cpy z1.d, p0/m, z2.d + cpy z2.d, p1/m, z1.d + cpy z3.d, p1/m, z2.d + + # Test Alias + mov z4.d, p0/m, z1.d + mov z5.d, p1/m, z2.d + )"); + CHECK_NEON(0, int64_t, fillNeon({10}, VL / 8)); + CHECK_NEON(1, int64_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(2, int64_t, fillNeon({10}, VL / 16)); + CHECK_NEON(3, int64_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(4, int64_t, fillNeon({2048}, VL / 8)); + CHECK_NEON(5, int64_t, fillNeon({2048}, VL / 16)); } TEST_P(InstSve, fcpy) { From 1873378bdae477d35a7e3f299c841ae91eedc80d Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 20 May 2024 22:53:23 +0100 Subject: [PATCH 04/38] Fixed OMP getaffinity syscall for new fix. Fixed tests for CPY_ZPmV instructions --- test/regression/aarch64/Syscall.cc | 2 +- test/regression/aarch64/instructions/sve.cc | 151 +++++++++++--------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/test/regression/aarch64/Syscall.cc b/test/regression/aarch64/Syscall.cc index 0866c278e2..c7c19eb9a2 100644 --- a/test/regression/aarch64/Syscall.cc +++ b/test/regression/aarch64/Syscall.cc @@ -1080,7 +1080,7 @@ TEST_P(Syscall, sched_getaffinity) { )"); EXPECT_EQ(getGeneralRegister(21), -1); EXPECT_EQ(getGeneralRegister(22), -1); - EXPECT_EQ(getGeneralRegister(23), 1); + EXPECT_EQ(getGeneralRegister(23), 8); } // TODO: write tgkill test diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index e75b1c2061..f7d4d445e6 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1546,33 +1546,39 @@ TEST_P(InstSve, cpy) { fillNeon({static_cast(-2048)}, VL / 16)); // SIMD & FP scalar + // Tests are different for 8/16 bit vs 32/64 bit due to the lack of fmov + // support for h and b registers // 8-bit RUN_AARCH64(R"( - mov x0, #0 - mov x1, #2 - addvl x0, x0, #1 - sdiv x0, x0, x1 - - ptrue p0.b - whilelo p1.b, xzr, x0 - - dup z1.b, #10 - dup z2.b, #-8 - - cpy z0.b, p0/m, z1.b - cpy z1.b, p0/m, z2.b - cpy z2.b, p1/m, z1.b - cpy z3.b, p1/m, z2.b - - # Test Alias - mov z4.b, p0/m, z1.b - mov z5.b, p1/m, z2.b - )"); + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + ptrue p0.b + whilelo p1.b, xzr, x0 + + cpy z6.b, p0/z, #10 + cpy z7.b, p0/z, #-8 + cpy z8.b, p0/z, #12 + cpy z9.b, p0/z, #-16 + cpy z10.b, p0/z, #12 + cpy z11.b, p0/z, #-8 + + cpy z0.b, p0/m, b6 + cpy z1.b, p0/m, b7 + cpy z2.b, p1/m, b8 + cpy z3.b, p1/m, b9 + + # Test Alias + mov z4.b, p0/m, b10 + mov z5.b, p1/m, b11 + )"); CHECK_NEON(0, int8_t, fillNeon({10}, VL / 8)); CHECK_NEON(1, int8_t, fillNeon({-8}, VL / 8)); - CHECK_NEON(2, int8_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int8_t, fillNeon({-8}, VL / 16)); - CHECK_NEON(4, int8_t, fillNeon({-8}, VL / 8)); + CHECK_NEON(2, int8_t, fillNeon({12}, VL / 16)); + CHECK_NEON(3, int8_t, fillNeon({-16}, VL / 16)); + CHECK_NEON(4, int8_t, fillNeon({12}, VL / 8)); CHECK_NEON(5, int8_t, fillNeon({-8}, VL / 16)); // 16-bit @@ -1585,24 +1591,31 @@ TEST_P(InstSve, cpy) { ptrue p0.h whilelo p1.h, xzr, x0 - dup z1.h, #10 - dup z2.h, #2048 + cpy z6.h, p0/z, #10 + cpy z7.h, p0/z, #8, lsl #8 + cpy z8.h, p0/z, #-12 + cpy z9.h, p0/z, #-16, lsl #8 + cpy z10.h, p0/z, #12 + cpy z11.h, p0/z, #-8, lsl #8 - cpy z0.h, p0/m, z1.h - cpy z1.h, p0/m, z2.h - cpy z2.h, p1/m, z1.h - cpy z3.h, p1/m, z2.h + cpy z0.h, p0/m, h6 + cpy z1.h, p0/m, h7 + cpy z2.h, p1/m, h8 + cpy z3.h, p1/m, h9 # Test Alias - mov z4.h, p0/m, z1.h - mov z5.h, p1/m, z2.h + mov z4.h, p0/m, h10 + mov z5.h, p1/m, h11 )"); CHECK_NEON(0, 
int16_t, fillNeon({10}, VL / 8)); - CHECK_NEON(1, int16_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(2, int16_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int16_t, fillNeon({2048}, VL / 16)); - CHECK_NEON(4, int16_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(5, int16_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(1, int16_t, + fillNeon({static_cast(2048)}, VL / 8)); + CHECK_NEON(2, int16_t, fillNeon({-12}, VL / 16)); + CHECK_NEON(3, int16_t, + fillNeon({static_cast(-4096)}, VL / 16)); + CHECK_NEON(4, int16_t, fillNeon({12}, VL / 8)); + CHECK_NEON(5, int16_t, + fillNeon({static_cast(-2048)}, VL / 16)); // 32-bit RUN_AARCH64(R"( @@ -1614,24 +1627,28 @@ TEST_P(InstSve, cpy) { ptrue p0.s whilelo p1.s, xzr, x0 - dup z1.s, #10 - dup z2.s, #2048 + fmov s6, #10 + fmov s7, #-8 + fmov s8, #12 + fmov s9, #-16 + fmov s10, #12 + fmov s11, #-8 - cpy z0.s, p0/m, z1.s - cpy z1.s, p0/m, z2.s - cpy z2.s, p1/m, z1.s - cpy z3.s, p1/m, z2.s + cpy z0.s, p0/m, s6 + cpy z1.s, p0/m, s7 + cpy z2.s, p1/m, s8 + cpy z3.s, p1/m, s9 # Test Alias - mov z4.s, p0/m, z1.s - mov z5.s, p1/m, z2.s + mov z4.S, p0/m, s10 + mov z5.S, p1/m, s11 )"); - CHECK_NEON(0, int32_t, fillNeon({10}, VL / 8)); - CHECK_NEON(1, int32_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(2, int32_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int32_t, fillNeon({2048}, VL / 16)); - CHECK_NEON(4, int32_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(5, int32_t, fillNeon({2048}, VL / 16)); + CHECK_NEON(0, float, fillNeon({10}, VL / 8)); + CHECK_NEON(1, float, fillNeon({static_cast(-8)}, VL / 8)); + CHECK_NEON(2, float, fillNeon({12}, VL / 16)); + CHECK_NEON(3, float, fillNeon({static_cast(-16)}, VL / 16)); + CHECK_NEON(4, float, fillNeon({12}, VL / 8)); + CHECK_NEON(5, float, fillNeon({static_cast(-8)}, VL / 16)); // 64-bit RUN_AARCH64(R"( @@ -1643,24 +1660,28 @@ TEST_P(InstSve, cpy) { ptrue p0.d whilelo p1.d, xzr, x0 - dup z1.d, #10 - dup z2.d, #2048 + fmov d6, #10 + fmov d7, #-8 + fmov d8, #12 + fmov d9, #-16 + fmov d10, #12 + fmov d11, #-8 - cpy z0.d, p0/m, z1.d - cpy z1.d, p0/m, z2.d - cpy z2.d, p1/m, z1.d - cpy z3.d, p1/m, z2.d + cpy z0.d, p0/m, d6 + cpy z1.d, p0/m, d7 + cpy z2.d, p1/m, d8 + cpy z3.d, p1/m, d9 # Test Alias - mov z4.d, p0/m, z1.d - mov z5.d, p1/m, z2.d - )"); - CHECK_NEON(0, int64_t, fillNeon({10}, VL / 8)); - CHECK_NEON(1, int64_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(2, int64_t, fillNeon({10}, VL / 16)); - CHECK_NEON(3, int64_t, fillNeon({2048}, VL / 16)); - CHECK_NEON(4, int64_t, fillNeon({2048}, VL / 8)); - CHECK_NEON(5, int64_t, fillNeon({2048}, VL / 16)); + mov z4.d, p0/m, d10 + mov z5.d, p1/m, d11 + )"); + CHECK_NEON(0, double, fillNeon({10}, VL / 8)); + CHECK_NEON(1, double, fillNeon({static_cast(-8)}, VL / 8)); + CHECK_NEON(2, double, fillNeon({12}, VL / 16)); + CHECK_NEON(3, double, fillNeon({static_cast(-16)}, VL / 16)); + CHECK_NEON(4, double, fillNeon({12}, VL / 8)); + CHECK_NEON(5, double, fillNeon({static_cast(-8)}, VL / 16)); } TEST_P(InstSve, fcpy) { From 351832716060a8da59e70863ae7a5d6479997a4a Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Tue, 21 May 2024 16:54:31 +0100 Subject: [PATCH 05/38] Added more instructions so stream+sve compiles with armclang23. Some instructions/helpers from neoverse-v2 branch. 
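This patch also folds the whilelo helper into a single sveWhile routine that
takes the lane comparison as a parameter, so whilelo/whilels/whilelt can share
one implementation. A rough standalone sketch of that idea (illustrative only;
the names and signatures below are not the SimEng API):

#include <array>
#include <cstdint>
#include <functional>

// Build a predicate whose lane i is active while cmp(n + i, m) holds.
// P is the element width the predicate governs (uint8_t ... uint64_t).
template <typename T, typename P>
std::array<uint64_t, 4> whilePred(T n, T m, unsigned vlBits,
                                  std::function<bool(T, T)> cmp) {
  std::array<uint64_t, 4> out = {0, 0, 0, 0};
  const unsigned elems = vlBits / (sizeof(P) * 8);
  for (unsigned i = 0; i < elems; i++) {
    if (cmp(static_cast<T>(n + i), m))
      out[i / (64 / sizeof(P))] |=
          1ull << ((i % (64 / sizeof(P))) * sizeof(P));
  }
  return out;
}

int main() {
  // whilelo with n=0, m=4 over 32-bit lanes at VL=128 activates all four
  // lanes, i.e. predicate word 0 becomes 0x1111 in the bit-per-byte layout.
  auto p = whilePred<uint32_t, uint32_t>(
      0, 4, 128, [](uint32_t a, uint32_t b) { return a < b; });
  return p[0] == 0x1111 ? 0 : 1;
}

Passing the comparator keeps the predicate and NZCV bookkeeping in one place,
while each opcode case only has to supply the ordering.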
--- .../simeng/arch/aarch64/helpers/sve.hh | 91 ++++++++-- src/lib/arch/aarch64/Instruction_execute.cc | 170 +++++++++++++++--- 2 files changed, 227 insertions(+), 34 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 4c162bcddf..27cb63d3ae 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -962,6 +962,34 @@ RegisterValue sveIndex( return {out, 256}; } +/** Helper function for SVE instructions with the format `lastb vd, pg, zn`. + * T represents the vector register type (e.g. zd.d would be uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveLastBScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out; + + // Get last active element + int lastElem = 0; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + // If no active lane has been found, select highest element instead + if (i == 0) lastElem = partition_num - 1; + } + + out = n[lastElem]; + return {out, 256}; +} + /** Helper function for SVE instructions with the format ` * pd, pg/z, pn, pm`. * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). @@ -1451,6 +1479,51 @@ RegisterValue sveSminv(srcValContainer& sourceValues, const uint16_t VL_bits) { return {out, 256}; } +/** Helper function for SVE instructions with the format `splice zd, pg, zn, + * zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveSplice(srcValContainer& sourceValues, const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + // Get last active element + int lastElem = 0; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + + // Extract region from n as denoted by predicate p. Copy region into the + // lowest elements of the destination operand + bool active = false; + int index = 0; + for (int i = 0; i <= lastElem; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) active = true; + if (active) { + out[index] = n[i]; + index++; + } + } + + // Set any unassigned elements to the lowest elements in m + int elemsLeft = partition_num - index; + for (int i = 0; i < elemsLeft; i++) { + out[index] = m[i]; + index++; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `Sub zd, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). @@ -1660,33 +1733,31 @@ RegisterValue sveUzp_vecs(srcValContainer& sourceValues, const uint16_t VL_bits, return {out, 256}; } -/** Helper function for SVE instructions with the format `whilelo pd, - * n, m`. +/** Helper function for SVE instructions with the format `while pd, n, m`. * T represents the type of sourceValues n and m (e.g. 
for wn, T = uint32_t). * P represents the type of operand p (e.g. for pd.b, P = uint8_t). * Returns tuple of type [pred results (array of 4 uint64_t), nzcv]. */ template -std::tuple, uint8_t> sveWhilelo( - srcValContainer& sourceValues, const uint16_t VL_bits, bool calcNZCV) { +std::tuple, uint8_t> sveWhile( + srcValContainer& sourceValues, const uint16_t VL_bits, + std::function func) { const T n = sourceValues[0].get(); const T m = sourceValues[1].get(); const uint16_t partition_num = VL_bits / (sizeof(P) * 8); std::array out = {0, 0, 0, 0}; - uint16_t index = 0; for (int i = 0; i < partition_num; i++) { // Determine whether lane should be active and shift to align with // element in predicate register. uint64_t shifted_active = - (n + i) < m ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; - out[index / (64 / (sizeof(P)))] = - out[index / (64 / (sizeof(P)))] | shifted_active; - index++; + func((n + i), m) ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0; + out[i / (64 / (sizeof(P)))] |= shifted_active; } // Byte count = sizeof(P) as destination predicate is predicate of P // bytes. - uint8_t nzcv = calcNZCV ? getNZCVfromPred(out, VL_bits, sizeof(P)) : 0; + uint8_t nzcv = getNZCVfromPred(out, VL_bits, sizeof(P)); return {out, nzcv}; } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a7f8da3e86..e2ac80dab8 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2573,6 +2573,22 @@ void Instruction::execute() { vecInsIndex_gpr(sourceValues_, metadata_); break; } + case Opcode::AArch64_LASTB_VPZ_D: { // lastb dd, pg, zn.d + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_S: { // lastb sd, pg, zn.s + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_H: { // lastb hd, pg, zn.h + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_LASTB_VPZ_B: { // lastb bd, pg, zn.b + results_[0] = sveLastBScalar(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD @@ -4327,34 +4343,88 @@ void Instruction::execute() { results_[0] = maddl_4ops(sourceValues_); break; } + case Opcode::AArch64_SMAX_ZI_D: { // smax zdn.d, zdn.d, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_SMAX_ZI_S: { // smax zdn.s, zdn.s, #imm results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_SMAX_ZI_H: { // smax zdn.h, zdn.h, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZI_B: { // smax zdn.b, zdn.b, #imm + results_[0] = sveMax_vecImm(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZPmZ_D: { // smax zd.d, pg/m, zn.d, zm.d + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMAX_ZPmZ_S: { // smax zd.s, pg/m, zn.s, zm.s results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_SMAX_ZPmZ_H: { // smax zd.h, pg/m, zn.h, zm.h + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SMAX_ZPmZ_B: { // smax zd.b, pg/m, zn.b, zm.b + results_[0] = sveMaxPredicated_vecs(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMAXv4i32: { // smax vd.4s, vn.4s, vm.4s results_[0] = 
vecLogicOp_3vecs( sourceValues_, [](int32_t x, int32_t y) -> int32_t { return std::max(x, y); }); break; } + case Opcode::AArch64_SMINV_VPZ_D: { // sminv sd, pg, zn.d + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMINV_VPZ_S: { // sminv sd, pg, zn.s results_[0] = sveSminv(sourceValues_, VL_bits); break; } + case Opcode::AArch64_SMINV_VPZ_H: { // sminv sd, pg, zn.h + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SMINV_VPZ_B: { // sminv sd, pg, zn.b + results_[0] = sveSminv(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SMINVv4i32v: { // sminv sd, vn.4s results_[0] = vecMinv_2ops(sourceValues_); break; } + case Opcode::AArch64_SMIN_ZPmZ_D: { // smin zd.d, pg/m, zn.d, zm.d + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> int64_t { return std::min(x, y); }); + break; + } case Opcode::AArch64_SMIN_ZPmZ_S: { // smin zd.s, pg/m, zn.s, zm.s results_[0] = sveLogicOpPredicated_3vecs( sourceValues_, VL_bits, [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } + case Opcode::AArch64_SMIN_ZPmZ_H: { // smin zd.h, pg/m, zn.h, zm.h + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int16_t x, int16_t y) -> int16_t { return std::min(x, y); }); + break; + } + case Opcode::AArch64_SMIN_ZPmZ_B: { // smin zd.b, pg/m, zn.b, zm.b + results_[0] = sveLogicOpPredicated_3vecs( + sourceValues_, VL_bits, + [](int8_t x, int8_t y) -> int8_t { return std::min(x, y); }); + break; + } case Opcode::AArch64_SMINv4i32: { // smin vd.4s, vn.4s, vm.4s results_[0] = vecLogicOp_3vecs( sourceValues_, @@ -4386,6 +4456,14 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_SPLICE_ZPZ_D: { // splice zdn.d, pv, zdn.t, zm.d + results_[0] = sveSplice(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_SPLICE_ZPZ_S: { // splice zdn.s, pv, zdn.t, zm.s + results_[0] = sveSplice(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_SSHLLv2i32_shift: { // sshll vd.2d, vn.2s, #imm results_[0] = vecShllShift_vecImm( sourceValues_, metadata_, false); @@ -5754,85 +5832,129 @@ void Instruction::execute() { break; } case Opcode::AArch64_WHILELO_PWW_B: { // whilelo pd.b, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_D: { // whilelo pd.d, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_H: { // whilelo pd.h, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PWW_S: { // whilelo pd.s, wn, wm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint32_t x, uint32_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_B: { // whilelo pd.b, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + 
auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_D: { // whilelo pd.d, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_H: { // whilelo pd.h, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELO_PXX_S: { // whilelo pd.s, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x < y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_B: { // whilels pd.b, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_D: { // whilels pd.d, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_H: { // whilels pd.h, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_WHILELS_PXX_S: { // whilels pd.s, xn, xm + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](uint64_t x, uint64_t y) -> bool { return x <= y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_B: { // whilelt pd.b, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_D: { // whilelt pd.d, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_H: { // whilelt pd.h, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; } case Opcode::AArch64_WHILELT_PXX_S: { // whilelt pd.s, xn, xm - auto [output, nzcv] = - sveWhilelo(sourceValues_, VL_bits, true); + auto [output, nzcv] = sveWhile( + sourceValues_, VL_bits, + [](int64_t x, int64_t y) -> bool { return x < y; }); results_[0] = nzcv; results_[1] = output; break; From 81889ab63429296234f3a3a604b58d4d722af7a1 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Wed, 22 May 2024 19:24:58 +0100 Subject: [PATCH 06/38] Added a couple more instructions, working towards minibude armclang23 --- .../simeng/arch/aarch64/helpers/sve.hh | 22 ++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 36 
+++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 27cb63d3ae..fd9047f635 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1347,6 +1347,28 @@ std::array svePsel( return out; } +/** Helper function for SVE instructions with the format `pfirst pdn, pg, pdn`. + * Returns an array of 4 uint64_t elements. */ +std::array svePfirst(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / 8; + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); + // Set destination d as source n to copy all false lanes and the active lanes + // beyond the first + std::array out = {dn[0], dn[1], dn[2], dn[3]}; + + // Get the first active lane and set same lane in destination predicate + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64))); + if (p[i / 64] & shifted_active) { + out[i / 64] |= shifted_active; + break; + } + } + return out; +} + /** Helper function for SVE instructions with the format `ptrue pd{, * pattern}. * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e2ac80dab8..e93ed70fce 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -833,6 +833,38 @@ void Instruction::execute() { results_[1] = output; break; } + case Opcode::AArch64_CMPHS_PPzZZ_B: { // cmphs pd.b, pg/z, zn.b, zm.b + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint8_t x, uint8_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_D: { // cmphs pd.d, pg/z, zn.d, zm.d + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint64_t x, uint64_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_H: { // cmphs pd.h, pg/z, zn.h, zm.h + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint16_t x, uint16_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } + case Opcode::AArch64_CMPHS_PPzZZ_S: { // cmphs pd.s, pg/z, zn.s, zm.s + auto [output, nzcv] = sveCmpPredicated_toPred( + sourceValues_, metadata_, VL_bits, true, + [](uint32_t x, uint32_t y) -> bool { return x >= y; }); + results_[0] = nzcv; + results_[1] = output; + break; + } case Opcode::AArch64_CMPNE_PPzZI_B: { // cmpne pd.b, pg/z. zn.b, #imm auto [output, nzcv] = sveCmpPredicated_toPred( sourceValues_, metadata_, VL_bits, true, @@ -4055,6 +4087,10 @@ void Instruction::execute() { results_[0] = out; break; } + case Opcode::AArch64_PFIRST_B: { // pfirst pdn.b, pg, pdn.b + results_[0] = svePfirst(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] break; } From c6c600018add51796b5c7a5bf601862e4686654b Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 30 May 2024 14:26:54 +0100 Subject: [PATCH 07/38] Added ClastB instructions with tests that (finally) pass. 
More tests to come --- .../simeng/arch/aarch64/helpers/sve.hh | 31 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 ++++++++++ test/regression/aarch64/instructions/sve.cc | 30 ++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index fd9047f635..b1871f646f 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -990,6 +990,37 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `clastb vd, pg, vd, + * zn`. T represents the vector register type (e.g. zd.d would be uint64_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveCLastBScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[1].getAsVector(); + const uint64_t* m = sourceValues[2].getAsVector(); + const T* n = sourceValues[3].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out; + + // Get last active element + int lastElem = -1; + for (int i = partition_num - 1; i >= 0; i--) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + + if (lastElem < 0) { + out = static_cast(static_cast(m[0])); + } else { + out = static_cast(static_cast(n[lastElem])); + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format ` * pd, pg/z, pn, pm`. * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e93ed70fce..2f16247aa1 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2621,6 +2621,22 @@ void Instruction::execute() { results_[0] = sveLastBScalar(sourceValues_, VL_bits); break; } + case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_S: { // clastb sd, pg, sn, zn.s + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_H: { // clastb hd, pg, hn, zn.h + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_CLASTB_VPZ_B: { // clastb bd, pg, bn, zn.b + results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f7d4d445e6..dbb0961f96 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6302,6 +6302,36 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } +TEST_P(InstSve, clastb) { + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z0.d, x0 + dup z1.d, x1 + + pfalse p0.b + clastb d0, p0, d0, z1.d + mov z4.d, z0.d + + ptrue p0.d + clastb d0, p0, d0, z1.d + mov z5.d, z0.d + )"); + // EXPECT_EQ(getGeneralRegister(0), (0x0123456789ABCDEF)); + CHECK_NEON(4, uint64_t, + 
fillNeon({0x0123456789ABCDEF}, 8)); // False + + CHECK_NEON(5, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); // + // True +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 240ae68113ea24e082af53d80c5368357ecce39d Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 30 May 2024 15:16:49 +0100 Subject: [PATCH 08/38] Cleaned up clastb tests and added S,H,B cases --- test/regression/aarch64/instructions/sve.cc | 94 ++++++++++++++++++--- 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index dbb0961f96..417a73e436 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6303,6 +6303,7 @@ TEST_P(InstSve, smulh) { } TEST_P(InstSve, clastb) { + // 64 bit RUN_AARCH64(R"( movz x0, #0xCDEF movk x0, #0x89AB, LSL #16 @@ -6313,23 +6314,94 @@ TEST_P(InstSve, clastb) { movk x1, #0xCBA9, LSL #32 movk x1, #0x1FED, LSL #48 - dup z0.d, x0 - dup z1.d, x1 + dup z2.d, x0 + dup z3.d, x1 pfalse p0.b - clastb d0, p0, d0, z1.d - mov z4.d, z0.d + clastb d2, p0, d2, z3.d + mov z0.d, z2.d ptrue p0.d - clastb d0, p0, d0, z1.d - mov z5.d, z0.d + clastb d2, p0, d2, z3.d + mov z1.d, z2.d )"); - // EXPECT_EQ(getGeneralRegister(0), (0x0123456789ABCDEF)); - CHECK_NEON(4, uint64_t, - fillNeon({0x0123456789ABCDEF}, 8)); // False + CHECK_NEON(0, uint64_t, fillNeon({0x0123456789ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); + + // 32 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 - CHECK_NEON(5, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); // - // True + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb s2, p0, s2, z3.s + mov z0.d, z2.d + + ptrue p0.s + clastb s2, p0, s2, z3.s + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x89ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA9}, 8)); + + // 16 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb h2, p0, h2, z3.h + mov z0.d, z2.d + + ptrue p0.h + clastb h2, p0, h2, z3.h + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FED}, 8)); + + // 8 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + clastb b2, p0, b2, z3.b + mov z0.d, z2.d + + ptrue p0.b + clastb b2, p0, b2, z3.b + mov z1.d, z2.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0xEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); } TEST_P(InstSve, st1b) { From 5e798500400a52424a204c18a4b1203704f6adbe Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 30 May 2024 18:22:47 +0100 Subject: [PATCH 09/38] Dirty WIP for pnext instruction --- .../simeng/arch/aarch64/helpers/sve.hh | 45 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 +++++++ 2 files changed, 61 insertions(+) diff --git 
a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index b1871f646f..bc53e4bcff 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1400,6 +1400,51 @@ std::array svePfirst(srcValContainer& sourceValues, return out; } +/** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. + * Returns an array of 4 uint64_t elements. */ +template +std::array svePnext( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); + std::array out = {dn[0], dn[1], dn[2], dn[3]}; + + // Get pattern + const uint16_t count = + sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); + // Exit early if count == 0 + if (count == 0) return out; + + // Create mask so we can zero the pattern + uint64_t mask = ~((1ULL << (64 - count * 8)) - 1); + out[0] &= mask; + + // Get last active element of dn.pattern + int lastElem = -1; + for (int i = partition_num - 1; i >= 0; i--) { + if (i < count) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (dn[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; + } + } + } + // Get next active element of p, starting from last of dn.pattern + for (int i = lastElem + 1; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (p[i / (64 / sizeof(T))] & shifted_active) { + out[i / (64 / sizeof(T))] |= shifted_active; + break; + } + } + + return out; +} + /** Helper function for SVE instructions with the format `ptrue pd{, * pattern}. * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t). 
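// Standalone sketch (not SimEng code) of the PNEXT behaviour implemented by
// svePnext above: find the highest active element of pdn, then activate only
// the next element that is active in the governing predicate pv.  The real
// helper also honours the operand's pattern count (and, in a later commit,
// sets NZCV); both are omitted here.  All names below are hypothetical.
#include <array>
#include <cstdint>

template <typename T>
std::array<uint64_t, 4> pnextRef(const std::array<uint64_t, 4>& pv,
                                 const std::array<uint64_t, 4>& pdn,
                                 unsigned vlBits) {
  std::array<uint64_t, 4> out = {0, 0, 0, 0};
  const int elems = vlBits / (sizeof(T) * 8);
  auto bit = [](int i) {
    return 1ull << ((i % (64 / static_cast<int>(sizeof(T)))) * sizeof(T));
  };
  // Highest active element of the source/destination predicate, -1 if none.
  int last = -1;
  for (int i = elems - 1; i >= 0; i--)
    if (pdn[i / (64 / sizeof(T))] & bit(i)) { last = i; break; }
  // First element after it that is active in the governing predicate.
  for (int i = last + 1; i < elems; i++)
    if (pv[i / (64 / sizeof(T))] & bit(i)) {
      out[i / (64 / sizeof(T))] |= bit(i);
      break;
    }
  return out;
}

int main() {
  // Two 64-bit lanes (VL = 128): pdn marks lane 0, pv has both lanes active,
  // so the "next" active lane is lane 1 (bit 8 of predicate word 0).
  std::array<uint64_t, 4> pv = {0x0101, 0, 0, 0}, pdn = {0x1, 0, 0, 0};
  return pnextRef<uint64_t>(pv, pdn, 128)[0] == 0x100 ? 0 : 1;
}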
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 2f16247aa1..ad0448c8db 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4107,6 +4107,22 @@ void Instruction::execute() { results_[0] = svePfirst(sourceValues_, VL_bits); break; } + case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_PNEXT_H: { // pnext pdn.h, pv, pdn.h + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_PNEXT_S: { // pnext pdn.s, pv, pdn.s + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_PNEXT_D: { // pnext pdn.d, pv, pdn.d + results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] break; } From f8ea7f29f39919e1989d4f8d2e123759a5cb03cd Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 3 Jun 2024 15:45:01 +0100 Subject: [PATCH 10/38] Added pnext inst along with tests --- .../simeng/arch/aarch64/helpers/sve.hh | 13 +-- test/regression/aarch64/instructions/sve.cc | 96 +++++++++++++++++++ 2 files changed, 101 insertions(+), 8 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index bc53e4bcff..7fa84ad4b8 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1408,20 +1408,18 @@ std::array svePnext( const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - const uint64_t* p = sourceValues[0].getAsVector(); - const uint64_t* dn = sourceValues[1].getAsVector(); - std::array out = {dn[0], dn[1], dn[2], dn[3]}; + const uint64_t* p = sourceValues[1].getAsVector(); + const uint64_t* dn = sourceValues[2].getAsVector(); + // Set destination elements to 0 + std::array out = {0, 0, 0, 0}; // Get pattern const uint16_t count = sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); + // Exit early if count == 0 if (count == 0) return out; - // Create mask so we can zero the pattern - uint64_t mask = ~((1ULL << (64 - count * 8)) - 1); - out[0] &= mask; - // Get last active element of dn.pattern int lastElem = -1; for (int i = partition_num - 1; i >= 0; i--) { @@ -1441,7 +1439,6 @@ std::array svePnext( break; } } - return out; } diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 417a73e436..29394d00a8 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5798,6 +5798,102 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, pnext) { + initialHeapData_.resize(1024); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + + // B arrangement + // Allow 32 Byte space for each predicate register for when VL=2048 + std::vector src = {0xAAAA, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xAA00, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 12); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p2, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.b, p2, p0.b + + ldr p1, [x0] + add x0, x0, #32 + ldr p3, [x0] + + pnext p1.b, p3, p1.b + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x02, 0, 0, 0}, 32)); + CHECK_PREDICATE(1, 
uint64_t, + fillPredFromSource({0x0200, 0, 0, 0}, 32)); + + // H arrangement + src = {0x5555, 0x0, 0x0, 0x0, 0x3333, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p1, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.h, p1, p0.h + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x4000, 0, 0, 0}, 32)); + + // S arrangement + src = {0x9, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p1, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.s, p1, p0.s + )"); + CHECK_PREDICATE(0, uint64_t, + fillPredFromSource({0x1, 0, 0, 0}, 32)); + + // D arrangement + src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, + 0xFF0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + fillHeap(heap64, src, 12); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr p2, [x0] + add x0, x0, #32 + ldr p0, [x0] + + pnext p0.d, p2, p0.d + + add x0, x0, #32 + ldr p3, [x0] + add x0, x0, #32 + ldr p1, [x0] + + pnext p1.d, p3, p1.d + )"); + CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0, 0, 0, 0}, 32)); + CHECK_PREDICATE(1, uint64_t, + fillPredFromSource({0x100, 0, 0, 0}, 32)); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b From 5992cd17d1a62e9ee4206964418538d97e068cd2 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Mon, 3 Jun 2024 17:00:55 +0100 Subject: [PATCH 11/38] Added NZCV changes to pnext and updated tests --- .../simeng/arch/aarch64/helpers/sve.hh | 9 ++++----- src/lib/arch/aarch64/Instruction_execute.cc | 20 +++++++++++++++---- test/regression/aarch64/instructions/sve.cc | 14 ++++++++----- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 7fa84ad4b8..fa111280bb 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1401,9 +1401,9 @@ std::array svePfirst(srcValContainer& sourceValues, } /** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. - * Returns an array of 4 uint64_t elements. */ + * Returns an array of 4 uint64_t elements, and updates the NZCV flags. 
*/ template -std::array svePnext( +std::tuple, uint8_t> svePnext( srcValContainer& sourceValues, const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { @@ -1418,8 +1418,7 @@ std::array svePnext( sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); // Exit early if count == 0 - if (count == 0) return out; - + if (count == 0) return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; // Get last active element of dn.pattern int lastElem = -1; for (int i = partition_num - 1; i >= 0; i--) { @@ -1439,7 +1438,7 @@ std::array svePnext( break; } } - return out; + return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; } /** Helper function for SVE instructions with the format `ptrue pd{, diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index ad0448c8db..bc56cb8934 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4108,19 +4108,31 @@ void Instruction::execute() { break; } case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_H: { // pnext pdn.h, pv, pdn.h - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_S: { // pnext pdn.s, pv, pdn.s - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_D: { // pnext pdn.d, pv, pdn.d - results_[0] = svePnext(sourceValues_, metadata_, VL_bits); + auto [result, nzcv] = + svePnext(sourceValues_, metadata_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PRFMui: { // prfm op, [xn, xm{, extend{, #amount}}] diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 29394d00a8..fde761a148 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5829,9 +5829,10 @@ TEST_P(InstSve, pnext) { fillPredFromSource({0x02, 0, 0, 0}, 32)); CHECK_PREDICATE(1, uint64_t, fillPredFromSource({0x0200, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b0010); // H arrangement - src = {0x5555, 0x0, 0x0, 0x0, 0x3333, 0x0, 0x0, 0x0}; + src = {0x555, 0x0, 0x0, 0x0, 0x333, 0x0, 0x0, 0x0}; fillHeap(heap64, src, 8); RUN_AARCH64(R"( # Get heap address @@ -5846,7 +5847,8 @@ TEST_P(InstSve, pnext) { pnext p0.h, p1, p0.h )"); CHECK_PREDICATE(0, uint64_t, - fillPredFromSource({0x4000, 0, 0, 0}, 32)); + fillPredFromSource({0x400, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b0010); // S arrangement src = {0x9, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0}; @@ -5865,10 +5867,11 @@ TEST_P(InstSve, pnext) { )"); CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0x1, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b1010); // D arrangement - src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, - 0xFF0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + src = {0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, + 0xF3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; fillHeap(heap64, src, 12); RUN_AARCH64(R"( # Get heap address @@ -5891,7 +5894,8 @@ TEST_P(InstSve, pnext) { )"); CHECK_PREDICATE(0, uint64_t, fillPredFromSource({0, 0, 0, 0}, 32)); CHECK_PREDICATE(1, 
uint64_t, - fillPredFromSource({0x100, 0, 0, 0}, 32)); + fillPredFromSource({0x1, 0, 0, 0}, 32)); + EXPECT_EQ(getNZCV(), 0b1010); } TEST_P(InstSve, punpk) { From 49dbbbe2eed2024d68e8fdf2b53de6efddc7c615 Mon Sep 17 00:00:00 2001 From: JosephMoore25 Date: Thu, 13 Jun 2024 17:36:03 +0100 Subject: [PATCH 12/38] Added weird FP Trig SVE insts (untested). Minibude now works with armclang23! --- .../simeng/arch/aarch64/helpers/sve.hh | 126 ++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 ++++ 2 files changed, 150 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index fa111280bb..931ca27d42 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -875,6 +875,132 @@ RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `ftsmul zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. U represents the same precision as + * T, but as an integer type for the second source register. */ +template +RegisterValue sveFTrigSMul(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const U* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); + // Square each element in the first source vector and then set the sign bit + // to a copy of bit 0 of the corresponding element in the second source + // register + for (int i = 0; i < partition_num; i++) { + out[i] = n[i] * n[i]; + T sign_bit = m[i] & bit_0_mask ? 1.0 : -1.0; + out[i] = std::abs(out[i]) * sign_bit; + } + + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `ftssel zd, zn, zm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. U represents the same precision as + * T, but as an integer type for the second source register. */ +template +RegisterValue sveFTrigSSel(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const U* m = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + + U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); + U bit_1_mask = 1ull << (sizeof(T) * 8 - 2); + + // Place the value 1.0 or a copy of the first source vector element in the + // destination element, depending on bit 0 of the corresponding element of + // the second source vector. The sign bit of the destination element is + // copied from bit 1 of the second source vector + for (int i = 0; i < partition_num; i++) { + out[i] = m[i] & bit_0_mask ? 1.0 : n[i]; + T sign_bit = m[i] & bit_1_mask ? 1.0 : -1.0; + out[i] = std::abs(out[i]) * sign_bit; + } + + return {out, 256}; +} + +/** Helper function for SVE instructions with the format `ftmad zd, zn, zm, + * #imm`. T represents the type of sourceValues (e.g. for zn.d, T = double). + * Returns correctly formatted RegisterValue. 
**/ +template +RegisterValue sveFTrigMad( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const T* n = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + const uint8_t imm = static_cast(metadata.operands[1].imm); + + const std::array sin64 = {1.0, + -0.1666666666666661, + 0.8333333333320002e-02, + -0.1984126982840213e-03, + 0.2755731329901505e-05, + -0.2505070584637887e-07, + 0.1589413637195215e-09, + 0.0}; + + const std::array cos64 = {1.0, + -0.5000000000000000, + 0.4166666666666645e-01, + -0.1388888888886111e-02, + 0.2480158728388683e-04, + -0.2755731309913950e-06, + 0.2087558253975872e-08, + -0.1135338700720054e-10}; + + const std::array sin32 = {1.0, + -1.666666716337e-01, + 8.333330973983e-03, + -1.983967522392e-04, + 2.721174723774e-06, + 0.0, + 0.0, + 0.0}; + + const std::array cos32 = {1.0, + -5.000000000000e-01, + 4.166664928198e-02, + -1.388759003021e-03, + 2.446388680255e-05, + 0.0, + 0.0, + 0.0}; + + const uint16_t partition_num = VL_bits / (sizeof(T) * 8); + T out[256 / sizeof(T)] = {0}; + // std::array lut; + + for (int i = 0; i < partition_num; i++) { + T coeff; + const bool sign_bit = m[i] < 0 ? 1 : 0; + // If float then use those LUTs + if (sizeof(T) == 4) { + coeff = sign_bit ? cos32[imm] : sin32[imm]; + } + // Else if double use those LUTs + else { + coeff = sign_bit ? cos64[imm] : sin64[imm]; + } + // TODO: Add FP16 support if/when we eventually support these (may require + // C++23) + out[i] = n[i] * std::abs(m[i]) + coeff; + } + + return {out, 256}; +} + /** Helper function for SVE instructions with the format `inc * xdn{, pattern{, MUL #imm}}`. * T represents the type of operation (e.g. for INCB, T = int8_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index bc56cb8934..1d50f77e85 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -355,6 +355,30 @@ void Instruction::execute() { sveAdr_packedOffsets(sourceValues_, metadata_, VL_bits); break; } + case Opcode::AArch64_FTSMUL_ZZZ_S: { // ftsmul zd.s, zn.s, zm.s + results_[0] = sveFTrigSMul(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSMUL_ZZZ_D: { // ftsmul zd.d, zn.d, zm.d + results_[0] = sveFTrigSMul(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSSEL_ZZZ_S: { // ftssel zd.s, zn.s, zm.s + results_[0] = sveFTrigSSel(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTSSEL_ZZZ_D: { // ftssel zd.d, zn.d, zm.d + results_[0] = sveFTrigSSel(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FTMAD_ZZI_S: { // ftmad zd.s, zn.s, zm.s, #imm + results_[0] = sveFTrigMad(sourceValues_, metadata_, VL_bits); + break; + } + case Opcode::AArch64_FTMAD_ZZI_D: { // ftmad zd.s, zn.s, zm.s, #imm + results_[0] = sveFTrigMad(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_ANDSWri: { // ands wd, wn, #imm auto [result, nzcv] = logicOp_imm( sourceValues_, metadata_, true, From 2716a711e7ba913d286cec48a1f0cbc20f8bb30e Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 18 Jun 2024 18:12:59 +0100 Subject: [PATCH 13/38] Supported minisweep --- .../simeng/arch/aarch64/helpers/sve.hh | 13 +++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 22 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 931ca27d42..e7e841e82f 100644 --- 
a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -114,6 +114,19 @@ RegisterValue sveAddvPredicated(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for NEON instructions with the format `uaddlv Vd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveAddlv(srcValContainer& sourceValues) { + const U* n = sourceValues[0].getAsVector(); + T out = 0; + for (int i = 0; i < I; i++) { + out += n[i]; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, * lsl #<1,2,3>}]`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1d50f77e85..d4b795f876 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -205,6 +205,10 @@ void Instruction::execute() { results_[0] = vecSumElems_2ops(sourceValues_); break; } + case Opcode::AArch64_UADDLVv8i8v: { // uaddlv hd, vn.8b + results_[0] = sveAddlv(sourceValues_); + break; + } case Opcode::AArch64_ADDWri: { // add wd, wn, #imm{, shift} auto [result, nzcv] = addShift_imm(sourceValues_, metadata_, false); @@ -699,6 +703,12 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> bool { return (x == y); }); break; } + case Opcode::AArch64_CMEQv2i32rz: { // cmeq vd.2s, vn.2s, #0 + results_[0] = vecCompare( + sourceValues_, true, + [](uint32_t x, uint32_t y) -> bool { return (x == y); }); + break; + } case Opcode::AArch64_CMEQv4i32: { // cmeq vd.4s, vn.4s, vm.4s results_[0] = vecCompare( sourceValues_, false, @@ -717,6 +727,12 @@ void Instruction::execute() { [](int8_t x, int8_t y) -> bool { return (x == y); }); break; } + case Opcode::AArch64_CMHIv2i32: { // cmhi vd.2s, vn.2s, vm.2s + results_[0] = vecCompare( + sourceValues_, false, + [](uint32_t x, uint32_t y) -> bool { return (x > y); }); + break; + } case Opcode::AArch64_CMHIv4i32: { // cmhi vd.4s, vn.4s, vm.4s results_[0] = vecCompare( sourceValues_, false, @@ -4122,6 +4138,12 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_ORNv8i8: { // orn vd.8b, vn.8b, vn.8b + results_[0] = vecLogicOp_3vecs( + sourceValues_, + [](uint8_t x, uint8_t y) -> uint8_t { return x | (~y); }); + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; From 8c56ee5d350d9c272859542b91a7179671a4e997 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 19 Jun 2024 16:41:06 +0100 Subject: [PATCH 14/38] Added instructions to support CloverLeaf armclang23. Numerical error :O --- .../simeng/arch/aarch64/helpers/sve.hh | 20 ++++++++- src/lib/arch/aarch64/Instruction_execute.cc | 20 +++++++++ test/regression/aarch64/instructions/sve.cc | 43 +++++++++++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index e7e841e82f..7bf8d2a427 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -115,8 +115,9 @@ RegisterValue sveAddvPredicated(srcValContainer& sourceValues, } /** Helper function for NEON instructions with the format `uaddlv Vd, Vn.T`. - * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). 
- * Returns correctly formatted RegisterValue. */ + * T represents the type of the destination register (e.g. for h0, T = + * uint32_t). U represents the type of the sourceValues[0] (e.g. for v0.8b, U = + * uint8_t) Returns correctly formatted RegisterValue. */ template RegisterValue sveAddlv(srcValContainer& sourceValues) { const U* n = sourceValues[0].getAsVector(); @@ -127,6 +128,21 @@ RegisterValue sveAddlv(srcValContainer& sourceValues) { return {out, 256}; } +/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUMaxV(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out = n[0]; + for (int i = 1; i < I; i++) { + std::cout << "Comparing " << n[i] << " and " << out; + out = std::max(n[i], out); + std::cout << ". " << out << " won\n"; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, * lsl #<1,2,3>}]`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d4b795f876..f65f233082 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5800,6 +5800,26 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMAXVv16i8v: { // umaxv bd, vn.16b + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv4i16v: { // umaxv hd, vn.4h + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv4i32v: { // umaxv sd, vn.4s + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv8i16v: { // umaxv hd, vn.8h + results_[0] = sveUMaxV(sourceValues_); + break; + } + case Opcode::AArch64_UMAXVv8i8v: { // umaxv bd, vn.8b + results_[0] = sveUMaxV(sourceValues_); + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index fde761a148..3ef1d1148a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6402,6 +6402,49 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } +TEST_P(InstSve, umaxp) { + // umaxv vd, vn.t + initialHeapData_.resize(32); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + + // v0 + heap[0] = 0x01; + heap[1] = 0x00; + heap[2] = 0xFF; + heap[3] = 0xAA; + heap[4] = 0xBB; + heap[5] = 0xCC; + heap[6] = 0xDD; + heap[7] = 0xEE; + + // v1 + heap[8] = 0x00; + heap[9] = 0x00; + heap[10] = 0xEE; + heap[11] = 0x11; + heap[12] = 0x22; + heap[13] = 0x33; + heap[14] = 0x44; + heap[15] = 0x55; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + umaxv h2, v0.4h + umaxv h3, v1.4h + + )"); + CHECK_NEON(2, uint16_t, + {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(3, uint16_t, + {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); +} + TEST_P(InstSve, clastb) { // 64 bit RUN_AARCH64(R"( From 32d0d6c8e15cc29e5d8bca62bf3b1c7aa194e607 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 19 Jun 2024 19:08:24 +0100 Subject: [PATCH 
15/38] Added a test to start investigating what's wrong with cloverleaf --- src/include/simeng/arch/aarch64/helpers/sve.hh | 2 -- test/regression/aarch64/instructions/bitmanip.cc | 6 ++++++ test/regression/aarch64/instructions/sve.cc | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 7bf8d2a427..38f21baee5 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -136,9 +136,7 @@ RegisterValue sveUMaxV(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); T out = n[0]; for (int i = 1; i < I; i++) { - std::cout << "Comparing " << n[i] << " and " << out; out = std::max(n[i], out); - std::cout << ". " << out << " won\n"; } return {out, 256}; } diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64dc..8622169db0 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -71,11 +71,17 @@ TEST_P(InstBitmanip, extr) { extr w4, w1, w2, 4 extr w5, w1, w2, 24 extr w6, w1, w2, 31 + + # Check alias + ror w7, w1, 31 + ror w8, w1, 24 )"); EXPECT_EQ(getGeneralRegister(3), 0x12345678); EXPECT_EQ(getGeneralRegister(4), 0xF1234567); EXPECT_EQ(getGeneralRegister(5), 0xADBEEF12); EXPECT_EQ(getGeneralRegister(6), 0xBD5B7DDE); + EXPECT_EQ(getGeneralRegister(7), 0xBD5B7DDF); + EXPECT_EQ(getGeneralRegister(8), 0xADBEEFDE); // 64-bit initialHeapData_.resize(16); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 3ef1d1148a..951a6f7627 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6402,7 +6402,7 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } -TEST_P(InstSve, umaxp) { +TEST_P(InstSve, umaxv) { // umaxv vd, vn.t initialHeapData_.resize(32); uint8_t* heap = reinterpret_cast(initialHeapData_.data()); From 2bb065b790aeeb4b7f59c90ff96337ce2a6b8b49 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 20 Jun 2024 15:09:42 +0100 Subject: [PATCH 16/38] Added test for LDRSWroW --- test/regression/aarch64/instructions/load.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 09269eebb8..05ffdd90a0 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -1277,17 +1277,19 @@ TEST_P(InstLoad, ldrsw) { mov x0, 0 mov x8, 214 svc #0 - mov x5, 1 + mov x6, 1 # Load 32-bit values from heap and sign-extend to 64-bits ldrsw x1, [x0, #4] ldrsw x2, [x0], #4 ldrsw x3, [x0] - ldrsw x4, [x0, x5, lsl #2] + ldrsw x4, [x0, x6, lsl #2] + ldrsw x5, [x0, w6, uxtw #2] )"); EXPECT_EQ(getGeneralRegister(1), INT32_MAX); EXPECT_EQ(getGeneralRegister(2), -2); EXPECT_EQ(getGeneralRegister(3), INT32_MAX); EXPECT_EQ(getGeneralRegister(4), -5); + EXPECT_EQ(getGeneralRegister(5), -5); // ldursw RUN_AARCH64(R"( From b40d0113237fca8920053e4bfc7d56191d6d23a8 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 22 Aug 2024 17:33:52 +0000 Subject: [PATCH 17/38] Added mechanism to detect ROB loops. 
Also added FDIVv4f32 inst --- src/include/simeng/pipeline/ReorderBuffer.hh | 6 +++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ src/lib/pipeline/ReorderBuffer.cc | 24 +++++++++++++++----- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index 2e6e68e37b..d0622c2a30 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,6 +125,12 @@ class ReorderBuffer { */ uint64_t pc_; + /** The address of the last instruction at the head of the ROB to check if it's stuck */ + uint64_t last_inst_addr = 0; + + /** A counter for how many cycles the same instruction has been at the head of the ROB */ + uint64_t inst_repeat_counter = 0; + /** The sequence ID of the youngest instruction that should remain after the * current flush. */ uint64_t flushAfter_; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index f65f233082..65541eefcd 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1840,6 +1840,10 @@ void Instruction::execute() { results_[0] = vecFDiv(sourceValues_); break; } + case Opcode::AArch64_FDIVv4f32: { // fdiv vd.4s, vn.4s, vm.4s + results_[0] = vecFDiv(sourceValues_); + break; + } case Opcode::AArch64_FDUP_ZI_D: { // fdup zd.d, #imm results_[0] = sveDup_immOrScalar(sourceValues_, metadata_, VL_bits, true); diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index e72e6e79dc..e2ce8ebc63 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -36,20 +36,18 @@ void ReorderBuffer::reserve(const std::shared_ptr& insn) { void ReorderBuffer::commitMicroOps(uint64_t insnId) { if (buffer_.size()) { size_t index = 0; - uint64_t firstOp = UINT64_MAX; + int64_t firstOp = -1; bool validForCommit = false; - bool foundFirstInstance = false; // Find first instance of uop belonging to macro-op instruction for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() == insnId) { firstOp = index; - foundFirstInstance = true; break; } } - if (foundFirstInstance) { + if (firstOp > -1) { // If found, see if all uops are committable for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() != insnId) break; @@ -62,7 +60,7 @@ void ReorderBuffer::commitMicroOps(uint64_t insnId) { } if (!validForCommit) return; - assert(firstOp != UINT64_MAX && "firstOp hasn't been populated"); + assert(firstOp > -1 && "firstOp hasn't been populated"); // No early return thus all uops are committable for (; firstOp < buffer_.size(); firstOp++) { if (buffer_[firstOp]->getInstructionId() != insnId) break; @@ -81,6 +79,19 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { unsigned int n; for (n = 0; n < maxCommits; n++) { auto& uop = buffer_[0]; + if (uop->getInstructionAddress() == last_inst_addr) { + inst_repeat_counter++; + } else { + inst_repeat_counter = 0; + } + if (inst_repeat_counter > 10000000) { + std::cout << "Infinite loop detected in rob commit at instruction address " + << std::hex << uop->getInstructionAddress() << std::dec << " (" + << uop->getMicroOpIndex() << "). 
Killing.\n"; + exit(1); + } + last_inst_addr = uop->getInstructionAddress(); + if (!uop->canCommit()) { break; } @@ -97,7 +108,7 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { for (size_t i = 0; i < destinations.size(); i++) { rat_.commit(destinations[i]); } - + // If it's a memory op, commit the entry at the head of the respective queue if (uop->isLoad()) { lsq_.commitLoad(uop); @@ -227,3 +238,4 @@ uint64_t ReorderBuffer::getRetiredBranchesCount() const { } } // namespace pipeline } // namespace simeng + From 14cc2e1ab31dadde8f46390363dab3575077d75b Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 28 Aug 2024 18:37:58 +0100 Subject: [PATCH 18/38] Clang format --- src/include/simeng/pipeline/ReorderBuffer.hh | 6 ++++-- src/lib/pipeline/ReorderBuffer.cc | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index d0622c2a30..06a9aefadd 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,10 +125,12 @@ class ReorderBuffer { */ uint64_t pc_; - /** The address of the last instruction at the head of the ROB to check if it's stuck */ + /** The address of the last instruction at the head of the ROB to check if + * it's stuck */ uint64_t last_inst_addr = 0; - /** A counter for how many cycles the same instruction has been at the head of the ROB */ + /** A counter for how many cycles the same instruction has been at the head of + * the ROB */ uint64_t inst_repeat_counter = 0; /** The sequence ID of the youngest instruction that should remain after the diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index e2ce8ebc63..4887a69ad2 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -85,9 +85,10 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { inst_repeat_counter = 0; } if (inst_repeat_counter > 10000000) { - std::cout << "Infinite loop detected in rob commit at instruction address " - << std::hex << uop->getInstructionAddress() << std::dec << " (" - << uop->getMicroOpIndex() << "). Killing.\n"; + std::cout + << "Infinite loop detected in rob commit at instruction address " + << std::hex << uop->getInstructionAddress() << std::dec << " (" + << uop->getMicroOpIndex() << "). 
Killing.\n"; exit(1); } last_inst_addr = uop->getInstructionAddress(); @@ -108,7 +109,7 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { for (size_t i = 0; i < destinations.size(); i++) { rat_.commit(destinations[i]); } - + // If it's a memory op, commit the entry at the head of the respective queue if (uop->isLoad()) { lsq_.commitLoad(uop); @@ -238,4 +239,3 @@ uint64_t ReorderBuffer::getRetiredBranchesCount() const { } } // namespace pipeline } // namespace simeng - From bd3bfc8b1aa893ad4fadc6dd7b7ba12610a90575 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 29 Aug 2024 17:37:49 +0100 Subject: [PATCH 19/38] Fixed a couple build issues/warnings --- src/include/simeng/arch/aarch64/helpers/sve.hh | 6 +++--- src/lib/pipeline/ReorderBuffer.cc | 6 ++++-- test/regression/aarch64/AArch64RegressionTest.hh | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 38f21baee5..27e9ed7753 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -915,7 +915,7 @@ RegisterValue sveFTrigSMul(srcValContainer& sourceValues, const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; - U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); + U bit_0_mask = static_cast(1) << (sizeof(T) * 8 - 1); // Square each element in the first source vector and then set the sign bit // to a copy of bit 0 of the corresponding element in the second source // register @@ -941,8 +941,8 @@ RegisterValue sveFTrigSSel(srcValContainer& sourceValues, const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; - U bit_0_mask = 1ull << (sizeof(T) * 8 - 1); - U bit_1_mask = 1ull << (sizeof(T) * 8 - 2); + U bit_0_mask = static_cast(1) << (sizeof(T) * 8 - 1); + U bit_1_mask = static_cast(1) << (sizeof(T) * 8 - 2); // Place the value 1.0 or a copy of the first source vector element in the // destination element, depending on bit 0 of the corresponding element of diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index 4887a69ad2..f70b50b8bc 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -36,18 +36,20 @@ void ReorderBuffer::reserve(const std::shared_ptr& insn) { void ReorderBuffer::commitMicroOps(uint64_t insnId) { if (buffer_.size()) { size_t index = 0; - int64_t firstOp = -1; + uint64_t firstOp = UINT64_MAX; bool validForCommit = false; + bool foundFirstInstance = false; // Find first instance of uop belonging to macro-op instruction for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() == insnId) { firstOp = index; + foundFirstInstance = true; break; } } - if (firstOp > -1) { + if (foundFirstInstance) { // If found, see if all uops are committable for (; index < buffer_.size(); index++) { if (buffer_[index]->getInstructionId() != insnId) break; diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09d..3b2490666d 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -503,7 +503,7 @@ class AArch64RegressionTest : public RegressionTest { std::array generatedArray; generatedArray.fill(0); // Fill array by cycling through source elements - for (int i = 0; i < (num_bytes / sizeof(T)); i++) { + for (size_t i = 0; i < (num_bytes / sizeof(T)); i++) { generatedArray[i] = 
src[i % src.size()]; } return generatedArray; From 3e45c868a32992b7f5aa0fe54d81ca3bacd0bb01 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 30 Aug 2024 13:45:31 +0100 Subject: [PATCH 20/38] Added uaddlv test, as well as rolled back a ROB fix --- src/lib/pipeline/ReorderBuffer.cc | 2 +- test/regression/aarch64/instructions/neon.cc | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index f70b50b8bc..1ff4a6b6c5 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -62,7 +62,7 @@ void ReorderBuffer::commitMicroOps(uint64_t insnId) { } if (!validForCommit) return; - assert(firstOp > -1 && "firstOp hasn't been populated"); + assert(firstOp != UINT64_MAX && "firstOp hasn't been populated"); // No early return thus all uops are committable for (; firstOp < buffer_.size(); firstOp++) { if (buffer_[firstOp]->getInstructionId() != insnId) break; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..fdf7405c86 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -356,6 +356,26 @@ TEST_P(InstNeon, addv) { CHECK_NEON(1, uint8_t, {40}); } +TEST_P(InstNeon, uaddlv) { + // 16-bit + initialHeapData_.resize(16); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 16; i++) { + heap8[i] = (i + 1); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + uaddlv h1, v0.8b + )"); + CHECK_NEON(1, uint16_t, {36}); +} + TEST_P(InstNeon, and) { initialHeapData_.resize(32); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); From 96034a525f77eece4a7053fda21b42dcf3c881ac Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 30 Aug 2024 17:30:36 +0100 Subject: [PATCH 21/38] Added tests for cmphs and a couple other insts. 
Fixed a couple bugs to do with cmphs --- src/lib/arch/aarch64/Instruction_execute.cc | 8 +- test/regression/aarch64/instructions/neon.cc | 97 ++++++++-- test/regression/aarch64/instructions/sve.cc | 186 +++++++++++++++++++ 3 files changed, 268 insertions(+), 23 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 65541eefcd..fecf3a36ae 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -875,7 +875,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_B: { // cmphs pd.b, pg/z, zn.b, zm.b auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint8_t x, uint8_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; @@ -883,7 +883,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_D: { // cmphs pd.d, pg/z, zn.d, zm.d auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint64_t x, uint64_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; @@ -891,7 +891,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_H: { // cmphs pd.h, pg/z, zn.h, zm.h auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint16_t x, uint16_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; @@ -899,7 +899,7 @@ void Instruction::execute() { } case Opcode::AArch64_CMPHS_PPzZZ_S: { // cmphs pd.s, pg/z, zn.s, zm.s auto [output, nzcv] = sveCmpPredicated_toPred( - sourceValues_, metadata_, VL_bits, true, + sourceValues_, metadata_, VL_bits, false, [](uint32_t x, uint32_t y) -> bool { return x >= y; }); results_[0] = nzcv; results_[1] = output; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index fdf7405c86..320014d0cb 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -711,18 +711,53 @@ TEST_P(InstNeon, cmeq) { CHECK_NEON(2, uint8_t, {0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, {0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00}); - // 32-bit + // 32-bit, 2 lane initialHeapData_.resize(128); - uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - heap32[0] = 10; - heap32[1] = 11; - heap32[2] = 12; - heap32[3] = 13; + uint32_t* heapv2i32 = reinterpret_cast(initialHeapData_.data()); + heapv2i32[0] = 10; + heapv2i32[1] = 0; + + heapv2i32[2] = 0; + heapv2i32[3] = 12; + + heapv2i32[4] = 15; + heapv2i32[5] = 9; + + heapv2i32[6] = 0; + heapv2i32[7] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + ldr q2, [x0, #16] + ldr q3, [x0, #24] + cmeq v4.2s, v0.2s, #0 + cmeq v5.2s, v1.2s, #0 + cmeq v6.2s, v2.2s, #0 + cmeq v7.2s, v3.2s, #0 + )"); + CHECK_NEON(4, uint32_t, {0, 0xFFFFFFFFu}); + CHECK_NEON(5, uint32_t, {0xFFFFFFFFu, 0}); + CHECK_NEON(6, uint32_t, {0, 0}); + CHECK_NEON(7, uint32_t, {0xFFFFFFFFu, 0xFFFFFFFFu}); + + // 32-bit, 4 lane + initialHeapData_.resize(128); + uint32_t* heapv4i32 = reinterpret_cast(initialHeapData_.data()); + heapv4i32[0] = 10; + heapv4i32[1] = 11; + heapv4i32[2] = 12; + heapv4i32[3] = 13; - heap32[4] = 13; - heap32[5] = 11; - heap32[6] = 12; - heap32[7] = 10; + heapv4i32[4] = 13; + heapv4i32[5] = 11; 
+ heapv4i32[6] = 12; + heapv4i32[7] = 10; RUN_AARCH64(R"( # Get heap address @@ -800,16 +835,40 @@ TEST_P(InstNeon, cmhs) { } TEST_P(InstNeon, cmhi) { + // 32-bit, 2 lane initialHeapData_.resize(32); - uint32_t* heap = reinterpret_cast(initialHeapData_.data()); - heap[0] = 42; - heap[1] = 7; - heap[2] = UINT32_MAX; - heap[3] = 7; - heap[4] = 1; - heap[5] = (1u << 31) - 1; - heap[6] = 0; - heap[7] = 7; + uint32_t* heapv2i32 = reinterpret_cast(initialHeapData_.data()); + heapv2i32[0] = UINT32_MAX; + heapv2i32[1] = 7; + + heapv2i32[2] = 1; + heapv2i32[3] = 7; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + cmhi v2.2s, v0.2s, v1.2s + cmhi v3.2s, v1.2s, v0.2s + )"); + CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0}); + CHECK_NEON(3, uint32_t, {0x0, 0x0}); + + // 32-bit, 4 lane + initialHeapData_.resize(32); + uint32_t* heapv4i32 = reinterpret_cast(initialHeapData_.data()); + heapv4i32[0] = 42; + heapv4i32[1] = 7; + heapv4i32[2] = UINT32_MAX; + heapv4i32[3] = 7; + heapv4i32[4] = 1; + heapv4i32[5] = (1u << 31) - 1; + heapv4i32[6] = 0; + heapv4i32[7] = 7; RUN_AARCH64(R"( # Get heap address diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 951a6f7627..4b0fe0e8ff 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1322,6 +1322,192 @@ TEST_P(InstSve, cmphi_vec) { EXPECT_EQ(getNZCV(), 0b0110); } +TEST_P(InstSve, cmphs_vec) { + // 8-bit + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z0.b, z1.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.b, xzr, x0 + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z0.b, z1.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #-5 + dup z1.b, #4 + + cmphs p1.b, p0/z, z1.b, z0.b + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.b + dup z0.b, #6 + dup z1.b, #6 + + cmphs p1.b, p0/z, z1.b, z0.b + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 16-bit + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z0.h, z1.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.h, xzr, x0 + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z0.h, z1.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #-5 + dup z1.h, #4 + + cmphs p1.h, p0/z, z1.h, z0.h + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.h + dup z0.h, #6 + dup z1.h, #6 + + cmphs p1.h, p0/z, z1.h, z0.h + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z0.s, z1.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.s, xzr, x0 + dup z0.s, #-5 + dup 
z1.s, #4 + + cmphs p1.s, p0/z, z0.s, z1.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #-5 + dup z1.s, #4 + + cmphs p1.s, p0/z, z1.s, z0.s + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.s + dup z0.s, #6 + dup z1.s, #6 + + cmphs p1.s, p0/z, z1.s, z0.s + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 64-bit + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z0.d, z1.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + sdiv x0, x0, x1 + + whilelo p0.d, xzr, x0 + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z0.d, z1.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #-5 + dup z1.d, #4 + + cmphs p1.d, p0/z, z1.d, z0.d + )"); + CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0}); + EXPECT_EQ(getNZCV(), 0b0110); + + RUN_AARCH64(R"( + ptrue p0.d + dup z0.d, #6 + dup z1.d, #6 + + cmphs p1.d, p0/z, z1.d, z0.d + )"); + CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); +} + TEST_P(InstSve, cnt) { // pattern = all RUN_AARCH64(R"( From 466fc3d772256618cf88c9cfd31cc8c4496d2bd4 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 9 Sep 2024 21:00:47 +0100 Subject: [PATCH 22/38] Added tests for FDIV and LASTB. Fixed LASTB logic. --- .../simeng/arch/aarch64/helpers/sve.hh | 6 +- test/regression/aarch64/instructions/neon.cc | 34 +++++- test/regression/aarch64/instructions/sve.cc | 104 ++++++++++++++++++ 3 files changed, 137 insertions(+), 7 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 27e9ed7753..8b9f32c89a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1121,8 +1121,9 @@ RegisterValue sveIndex( template RegisterValue sveLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - const uint64_t* p = sourceValues[0].getAsVector(); - const T* n = sourceValues[1].getAsVector(); + // sourceValues are wrong and the correct value is in the previous index. + const uint64_t* p = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out; @@ -1149,6 +1150,7 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, template RegisterValue sveCLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { + // sourceValues are wrong and the correct value is in the previous index. 
const uint64_t* p = sourceValues[1].getAsVector(); const uint64_t* m = sourceValues[2].getAsVector(); const T* n = sourceValues[3].getAsVector(); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 320014d0cb..680574158a 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -1610,11 +1610,12 @@ TEST_P(InstNeon, fcvtl2) { TEST_P(InstNeon, fdiv) { initialHeapData_.resize(32); - double* heap = reinterpret_cast(initialHeapData_.data()); - heap[0] = 1.0; - heap[1] = -42.5; - heap[2] = -0.125; - heap[3] = 16.0; + // 2 Doubles + double* heapv2f64 = reinterpret_cast(initialHeapData_.data()); + heapv2f64[0] = 1.0; + heapv2f64[1] = -42.5; + heapv2f64[2] = -0.125; + heapv2f64[3] = 16.0; RUN_AARCH64(R"( # Get heap address @@ -1627,6 +1628,29 @@ TEST_P(InstNeon, fdiv) { fdiv v2.2d, v0.2d, v1.2d )"); CHECK_NEON(2, double, {-8.0, -2.65625}); + + // 4 Floats + float* heapv4f32 = reinterpret_cast(initialHeapData_.data()); + heapv4f32[0] = 1.0f; + heapv4f32[1] = -42.5f; + heapv4f32[2] = 10.0f; + heapv4f32[3] = 0.0f; + heapv4f32[4] = -0.125f; + heapv4f32[5] = 16.0f; + heapv4f32[6] = -2.0f; + heapv4f32[7] = 256.0f; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + fdiv v2.4s, v0.4s, v1.4s + )"); + CHECK_NEON(2, float, {-8.0f, -2.65625f, -5.0f, 0.0f}); } TEST_P(InstNeon, fmla) { diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 4b0fe0e8ff..a1a5429c2c 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6733,6 +6733,110 @@ TEST_P(InstSve, clastb) { CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); } +TEST_P(InstSve, lastb) { + // 64 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + ptrue p0.d + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb d4, p0, z2.d + mov z0.d, z4.d + + ptrue p0.d + lastb d5, p0, z3.d + mov z1.d, z5.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123456789ABCDEF}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA987654321}, 8)); + + // 32 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb s4, p0, z2.s + mov z0.d, z4.d + + ptrue p0.s + lastb s4, p0, z3.s + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x01234567}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FEDCBA9}, 8)); + + // 16 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL #48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb h4, p0, z2.h + mov z0.d, z4.d + + ptrue p0.h + lastb h4, p0, z3.h + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x0123}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1FED}, 8)); + + // 8 bit + RUN_AARCH64(R"( + movz x0, #0xCDEF + movk x0, #0x89AB, LSL #16 + movk x0, #0x4567, LSL #32 + movk x0, #0x0123, LSL #48 + movz x1, #0x4321 + movk x1, #0x8765, LSL #16 + movk x1, #0xCBA9, LSL #32 + movk x1, #0x1FED, LSL 
#48 + + dup z2.d, x0 + dup z3.d, x1 + + pfalse p0.b + lastb b4, p0, z2.b + mov z0.d, z4.d + + ptrue p0.b + lastb b4, p0, z3.b + mov z1.d, z4.d + )"); + CHECK_NEON(0, uint64_t, fillNeon({0x01}, 8)); + CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 0aa2584943b44fde2a28f9c94236fb28ccc8326f Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Sep 2024 15:53:49 +0100 Subject: [PATCH 23/38] Finally got smax tests --- test/regression/aarch64/instructions/sve.cc | 174 ++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index a1a5429c2c..f79402491a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6389,6 +6389,64 @@ TEST_P(InstSve, sel) { } TEST_P(InstSve, smax) { + // 64-bit + initialHeapData_.resize(VL / 4); + int64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB64 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap64, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #8 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.d, xzr, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + ld1d {z2.d}, p0/z, [x0, x2, lsl #3] + ld1d {z3.d}, p0/z, [x0, x1, lsl #3] + ld1d {z4.d}, p0/z, [x0, x1, lsl #3] + ld1d {z5.d}, p0/z, [x0, x1, lsl #3] + + smax z1.d, p0/m, z1.d, z0.d + smax z2.d, p1/m, z2.d, z0.d + + smax z3.d, z3.d, #0 + smax z4.d, z4.d, #-128 + smax z5.d, z5.d, #127 + )"); + std::vector results64 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int64_t, fillNeon(results64, VL / 8)); + std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end()); + CHECK_NEON(2, int64_t, fillNeonCombined(results64, srcB64, VL / 8)); + + CHECK_NEON(3, int64_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int64_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int64_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127}, + VL / 8)); + // 32-bit initialHeapData_.resize(VL / 4); int32_t* heap32 = reinterpret_cast(initialHeapData_.data()); @@ -6446,6 +6504,122 @@ TEST_P(InstSve, smax) { fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}, VL / 8)); + + // 16-bit + initialHeapData_.resize(VL / 4); + int16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA16 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB16 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap16, srcA16, srcB16, VL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #2 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.h, xzr, x3 + ptrue p0.h + + ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z1.h}, p0/z, [x0, x2, lsl #1] + ld1h {z2.h}, p0/z, [x0, x2, lsl #1] + ld1h {z3.h}, p0/z, [x0, x1, lsl #1] + ld1h {z4.h}, p0/z, 
[x0, x1, lsl #1] + ld1h {z5.h}, p0/z, [x0, x1, lsl #1] + + smax z1.h, p0/m, z1.h, z0.h + smax z2.h, p1/m, z2.h, z0.h + + smax z3.h, z3.h, #0 + smax z4.h, z4.h, #-128 + smax z5.h, z5.h, #127 + )"); + std::vector results16 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int16_t, fillNeon(results16, VL / 8)); + std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end()); + CHECK_NEON(2, int16_t, fillNeonCombined(results16, srcB16, VL / 8)); + + CHECK_NEON(3, int16_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int16_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int16_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127}, + VL / 8)); + + // 8-bit + initialHeapData_.resize(VL / 4); + int8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA8 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB8 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap8, srcA8, srcB8, VL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #1 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.b, xzr, x3 + ptrue p0.b + + ld1b {z0.b}, p0/z, [x0, x1] + ld1b {z1.b}, p0/z, [x0, x2] + ld1b {z2.b}, p0/z, [x0, x2] + ld1b {z3.b}, p0/z, [x0, x1] + ld1b {z4.b}, p0/z, [x0, x1] + ld1b {z5.b}, p0/z, [x0, x1] + + smax z1.b, p0/m, z1.b, z0.b + smax z2.b, p1/m, z2.b, z0.b + + smax z3.b, z3.b, #0 + smax z4.b, z4.b, #-128 + smax z5.b, z5.b, #127 + )"); + std::vector results8 = {16, 15, 14, 13, 5, 6, 7, 8, + 8, 7, 6, 5, 13, 14, -2, -1}; + CHECK_NEON(1, int8_t, fillNeon(results8, VL / 8)); + std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end()); + CHECK_NEON(2, int8_t, fillNeonCombined(results8, srcB8, VL / 8)); + + CHECK_NEON(3, int8_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8)); + CHECK_NEON(4, int8_t, + fillNeon( + {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1}, + VL / 8)); + CHECK_NEON(5, int8_t, + fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127}, + VL / 8)); } TEST_P(InstSve, smin) { From 75f0d9f32f85f62c5064099b014f41e06e299a31 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Sep 2024 16:21:39 +0100 Subject: [PATCH 24/38] Also added smin tests --- test/regression/aarch64/instructions/sve.cc | 195 ++++++++++++++++++-- 1 file changed, 183 insertions(+), 12 deletions(-) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f79402491a..b55d6b2a4d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6623,6 +6623,63 @@ TEST_P(InstSve, smax) { } TEST_P(InstSve, smin) { + // 64-bit + initialHeapData_.resize(VL / 4); + int64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB64 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap64, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #8 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.d, xzr, x3 + 
ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + ld1d {z2.d}, p0/z, [x0, x2, lsl #3] + + smin z1.d, p0/m, z1.d, z0.d + smin z2.d, p1/m, z2.d, z0.d + + sminv d3, p1, z1.d + sminv d4, p0, z2.d + )"); + + std::vector results64 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA64 = fillNeon(results64, VL / 8); + std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end()); + std::array arrB64 = + fillNeonCombined(results64, srcB64, VL / 8); + + CHECK_NEON(1, int64_t, arrA64); + CHECK_NEON(2, int64_t, arrB64); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. + int64_t minElemA64 = arrA64[std::distance( + arrA64.begin(), + std::min_element(arrA64.begin(), arrA64.end() - (32 - VL / 128)))]; + int64_t minElemB64 = arrB64[std::distance( + arrB64.begin(), + std::min_element(arrB64.begin(), arrB64.end() - (32 - VL / 64)))]; + CHECK_NEON(3, int64_t, {minElemA64, 0, 0, 0}); + CHECK_NEON(4, int64_t, {minElemB64, 0, 0, 0}); + // 32-bit initialHeapData_.resize(VL / 4); int32_t* heap32 = reinterpret_cast(initialHeapData_.data()); @@ -6662,23 +6719,137 @@ TEST_P(InstSve, smin) { std::vector results32 = {1, 2, 3, 4, -12, -11, -10, -9, -9, -10, -11, -12, 4, 3, -15, -1}; - std::array arrA = fillNeon(results32, VL / 8); + std::array arrA32 = fillNeon(results32, VL / 8); std::rotate(srcB32.begin(), srcB32.begin() + ((VL / 64) % 16), srcB32.end()); - std::array arrB = + std::array arrB32 = fillNeonCombined(results32, srcB32, VL / 8); - CHECK_NEON(1, int32_t, arrA); - CHECK_NEON(2, int32_t, arrB); + CHECK_NEON(1, int32_t, arrA32); + CHECK_NEON(2, int32_t, arrB32); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. + int32_t minElemA32 = arrA32[std::distance( + arrA32.begin(), + std::min_element(arrA32.begin(), arrA32.end() - (64 - VL / 64)))]; + int32_t minElemB32 = arrB32[std::distance( + arrB32.begin(), + std::min_element(arrB32.begin(), arrB32.end() - (64 - VL / 32)))]; + CHECK_NEON(3, int32_t, {minElemA32, 0, 0, 0}); + CHECK_NEON(4, int32_t, {minElemB32, 0, 0, 0}); + + // 16-bit + initialHeapData_.resize(VL / 4); + int16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA16 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB16 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap16, srcA16, srcB16, VL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #2 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.h, xzr, x3 + ptrue p0.h + + ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z1.h}, p0/z, [x0, x2, lsl #1] + ld1h {z2.h}, p0/z, [x0, x2, lsl #1] + + smin z1.h, p0/m, z1.h, z0.h + smin z2.h, p1/m, z2.h, z0.h + + sminv h3, p1, z1.h + sminv h4, p0, z2.h + )"); + + std::vector results16 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA16 = fillNeon(results16, VL / 8); + std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end()); + std::array arrB16 = + fillNeonCombined(results16, srcB16, VL / 8); + + CHECK_NEON(1, int16_t, arrA16); + CHECK_NEON(2, int16_t, arrB16); + // Find miniumum element. Modify search end point to only consider the + // elements within the current VL and predication. 
+ int16_t minElemA16 = arrA16[std::distance( + arrA16.begin(), + std::min_element(arrA16.begin(), arrA16.end() - (128 - VL / 32)))]; + int16_t minElemB16 = arrB16[std::distance( + arrB16.begin(), + std::min_element(arrB16.begin(), arrB16.end() - (128 - VL / 16)))]; + CHECK_NEON(3, int16_t, {minElemA16, 0, 0, 0}); + CHECK_NEON(4, int16_t, {minElemB16, 0, 0, 0}); + + // 8-bit + initialHeapData_.resize(VL / 4); + int8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector srcA8 = {1, 2, 3, 4, 5, 6, 7, 8, + -9, -10, -11, -12, 13, 14, -15, -1}; + std::vector srcB8 = {16, 15, 14, 13, -12, -11, -10, -9, + 8, 7, 6, 5, 4, 3, -2, -1}; + fillHeapCombined(heap8, srcA8, srcB8, VL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #0 + mov x4, #1 + mov x5, #2 + addvl x2, x2, #1 + udiv x2, x2, x4 + udiv x3, x2, x5 + whilelo p1.b, xzr, x3 + ptrue p0.b + + ld1b {z0.b}, p0/z, [x0, x1] + ld1b {z1.b}, p0/z, [x0, x2] + ld1b {z2.b}, p0/z, [x0, x2] + + smin z1.b, p0/m, z1.b, z0.b + smin z2.b, p1/m, z2.b, z0.b + + sminv b3, p1, z1.b + sminv b4, p0, z2.b + )"); + + std::vector results8 = {1, 2, 3, 4, -12, -11, -10, -9, + -9, -10, -11, -12, 4, 3, -15, -1}; + std::array arrA8 = fillNeon(results8, VL / 8); + std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end()); + std::array arrB8 = + fillNeonCombined(results8, srcB8, VL / 8); + + CHECK_NEON(1, int8_t, arrA8); + CHECK_NEON(2, int8_t, arrB8); // Find miniumum element. Modify search end point to only consider the // elements within the current VL and predication. - int32_t minElemA = arrA[std::distance( - arrA.begin(), - std::min_element(arrA.begin(), arrA.end() - (64 - VL / 64)))]; - int32_t minElemB = arrB[std::distance( - arrB.begin(), - std::min_element(arrB.begin(), arrB.end() - (64 - VL / 32)))]; - CHECK_NEON(3, int32_t, {minElemA, 0, 0, 0}); - CHECK_NEON(4, int32_t, {minElemB, 0, 0, 0}); + int8_t minElemA8 = arrA8[std::distance( + arrA8.begin(), + std::min_element(arrA8.begin(), arrA8.end() - (256 - VL / 16)))]; + int8_t minElemB8 = arrB8[std::distance( + arrB8.begin(), + std::min_element(arrB8.begin(), arrB8.end() - (256 - VL / 8)))]; + CHECK_NEON(3, int8_t, {minElemA8, 0, 0, 0}); + CHECK_NEON(4, int8_t, {minElemB8, 0, 0, 0}); } TEST_P(InstSve, smulh) { From 02386a380b27b947d02ab00d533472bbc9789ae0 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Sep 2024 23:48:19 +0100 Subject: [PATCH 25/38] Added tests for umaxv and whilels --- test/regression/aarch64/instructions/sve.cc | 258 +++++++++++++++++++- 1 file changed, 249 insertions(+), 9 deletions(-) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index b55d6b2a4d..f48e121c7d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6947,16 +6947,32 @@ TEST_P(InstSve, umaxv) { heap[5] = 0xCC; heap[6] = 0xDD; heap[7] = 0xEE; + heap[8] = 0x07; + heap[9] = 0x00; + heap[10] = 0xFC; + heap[11] = 0xFD; + heap[12] = 0xBA; + heap[13] = 0xCA; + heap[14] = 0x39; + heap[15] = 0xEF; // v1 - heap[8] = 0x00; - heap[9] = 0x00; - heap[10] = 0xEE; - heap[11] = 0x11; - heap[12] = 0x22; - heap[13] = 0x33; - heap[14] = 0x44; - heap[15] = 0x55; + heap[16] = 0x00; + heap[17] = 0x00; + heap[18] = 0xEE; + heap[19] = 0x11; + heap[20] = 0x22; + heap[21] = 0x33; + heap[22] = 0x44; + heap[23] = 0x55; + heap[24] = 0x26; + heap[25] = 0xFF; + heap[26] = 0xEA; + heap[27] = 0xFA; + heap[28] = 0x14; + heap[29] = 0x43; + heap[30] = 
0x21; + heap[31] = 0xAE; RUN_AARCH64(R"( # Get heap address @@ -6965,15 +6981,45 @@ TEST_P(InstSve, umaxv) { svc #0 ldr q0, [x0] - ldr q1, [x0, #8] + ldr q1, [x0, #16] umaxv h2, v0.4h umaxv h3, v1.4h + umaxv h4, v0.8h + umaxv h5, v1.8h + + umaxv s6, v0.4s + umaxv s7, v1.4s + + umaxv b8, v0.8b + umaxv b9, v1.8b + + umaxv b10, v0.16b + umaxv b11, v1.16b + )"); CHECK_NEON(2, uint16_t, {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); CHECK_NEON(3, uint16_t, {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(4, uint16_t, + {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(5, uint16_t, + {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(8, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(9, uint8_t, + {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(10, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(11, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) } TEST_P(InstSve, clastb) { @@ -8722,6 +8768,200 @@ TEST_P(InstSve, whilelo) { EXPECT_EQ(getNZCV(), 0b0110); } +TEST_P(InstSve, whilels) { + // 8-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + + whilels p0.b, xzr, x0 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + + whilels p1.b, x2, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + mov x3, #4 + udiv x4, x0, x3 + add x5, x4, x2 + + whilels p2.b, x5, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + sub x0, x0, #1 + mov x1, #0 + + whilels p3.b, x1, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 16-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x2, x0, x1 + + whilels p0.h, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x0, x0, x1 + udiv x2, x0, x1 + + whilels p1.h, x2, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x2, x0, x1 + mov x3, #8 + udiv x4, x0, x3 + mov x5, #2 + udiv x0, x0, x5 + add x6, x4, x2 + + whilels p2.h, x6, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.h, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 32-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x2, 
x0, x1 + + whilels p0.s, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + mov x2, #4 + udiv x0, x0, x2 + udiv x3, x0, x1 + + whilels p1.s, x3, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x2, x0, x1 + mov x3, #16 + udiv x4, x0, x3 + mov x5, #4 + udiv x0, x0, x5 + add x6, x4, x2 + + whilels p2.s, x6, x0 + )"); + CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1010); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #4 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.s, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 4)); + EXPECT_EQ(getNZCV(), 0b1000); + + // 64-bit arrangement, 64-bit source operands + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x2, x0, x1 + + whilels p0.d, xzr, x2 + )"); + CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #2 + mov x2, #8 + udiv x0, x0, x2 + udiv x3, x0, x1 + + whilels p1.d, x3, x0 + )"); + CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 8)); + if (VL == 128) { + EXPECT_EQ(getNZCV(), 0b1000); + } else { + EXPECT_EQ(getNZCV(), 0b1010); + } + + RUN_AARCH64(R"( + mov x0, #0 + addvl x0, x0, #1 + mov x1, #8 + udiv x0, x0, x1 + sub x0, x0, #1 + + whilels p3.d, xzr, x0 + )"); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 8)); + EXPECT_EQ(getNZCV(), 0b1000); +} + TEST_P(InstSve, whilelt) { // 8-bit arrangement, 64-bit source operands RUN_AARCH64(R"( From 4712ea43fe0492b9bdaa2be3f41e8f91c55e1656 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 11 Sep 2024 17:35:36 +0100 Subject: [PATCH 26/38] Added (or fixed) tests for pfirst and splice --- .../simeng/arch/aarch64/helpers/sve.hh | 6 +- test/regression/aarch64/instructions/sve.cc | 62 +++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 8b9f32c89a..7853ac438b 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1538,12 +1538,12 @@ std::array svePsel( std::array svePfirst(srcValContainer& sourceValues, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / 8; - const uint64_t* p = sourceValues[0].getAsVector(); - const uint64_t* dn = sourceValues[1].getAsVector(); + // sourceValues are wrong and the correct value is in the previous index. 
+ const uint64_t* p = sourceValues[1].getAsVector(); + const uint64_t* dn = sourceValues[2].getAsVector(); // Set destination d as source n to copy all false lanes and the active lanes // beyond the first std::array out = {dn[0], dn[1], dn[2], dn[3]}; - // Get the first active lane and set same lane in destination predicate for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % (64))); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f48e121c7d..e733c9e667 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5971,6 +5971,26 @@ TEST_P(InstSve, pfalse) { CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {0}, 1)); } +TEST_P(InstSve, pfirst) { + RUN_AARCH64(R"( + ptrue p0.b + pfalse p1.b + ptrue p2.b + ptrue p3.b + pfalse p4.b + pfalse p5.b + + pfirst p2.b, p0, p2.b + pfirst p3.b, p1, p3.b + pfirst p4.b, p0, p4.b + pfirst p5.b, p1, p5.b + )"); + CHECK_PREDICATE(2, uint64_t, fillPred(VL / 8, {1}, 1)); + CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); + CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); + CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); +} + TEST_P(InstSve, ptrue) { RUN_AARCH64(R"( ptrue p0.s @@ -7228,6 +7248,48 @@ TEST_P(InstSve, lastb) { CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); } +TEST_P(InstSve, splice) { + // 64-bit arrangement + RUN_AARCH64(R"( + fmov z0.d, #1.5 + fmov z1.d, #-0.5 + fmov z2.d, #1.5 + + ptrue p0.d + + mov x2, #0 + mov x4, #16 + addvl x2, x2, #1 + udiv x2, x2, x4 + whilelo p1.d, xzr, x2 + + splice z0.d, p0, z0.d, z1.d + splice z2.d, p1, z2.d, z1.d + )"); + CHECK_NEON(0, double, fillNeon({1.5}, VL / 8)); + CHECK_NEON(2, double, fillNeonCombined({1.5}, {-0.5}, VL / 8)); + + // 32-bit arrangement + RUN_AARCH64(R"( + fmov z0.s, #1.5 + fmov z1.s, #-0.5 + fmov z2.s, #1.5 + + ptrue p0.s + + mov x2, #0 + mov x4, #8 + addvl x2, x2, #1 + udiv x2, x2, x4 + whilelo p1.s, xzr, x2 + + splice z0.s, p0, z0.s, z1.s + splice z2.s, p1, z2.s, z1.s + )"); + CHECK_NEON(0, float, fillNeon({1.5}, VL / 8)); + CHECK_NEON(2, float, fillNeonCombined({1.5}, {-0.5}, VL / 8)); +} + TEST_P(InstSve, st1b) { initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From c73b2d2d74abd02646d2832d66ad1d5f5b4a0adc Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 11 Sep 2024 21:49:46 +0100 Subject: [PATCH 27/38] Added tests for ftsmul and fixed some broken logic --- .../simeng/arch/aarch64/helpers/sve.hh | 2 +- src/lib/arch/aarch64/InstructionMetadata.cc | 4 ++ test/regression/aarch64/instructions/sve.cc | 71 +++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 7853ac438b..d704a38269 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -921,7 +921,7 @@ RegisterValue sveFTrigSMul(srcValContainer& sourceValues, // register for (int i = 0; i < partition_num; i++) { out[i] = n[i] * n[i]; - T sign_bit = m[i] & bit_0_mask ? 1.0 : -1.0; + T sign_bit = m[i] & bit_0_mask ? 
-1.0 : 1.0; out[i] = std::abs(out[i]) * sign_bit; } diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..c177ef90f1 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -89,6 +89,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) } case Opcode::AArch64_SMAX_ZI_B: [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_H: diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index e733c9e667..c6f55865b4 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4934,6 +4934,77 @@ TEST_P(InstSve, index) { CHECK_NEON(7, uint64_t, fillNeonBaseAndOffset(10, 10, VL / 8)); } +TEST_P(InstSve, ftsmul) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {1.0, 2.0, 4.0, 12.34}; + std::vector srcB64 = {1.0, -5.4, 0.0, 78.2}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + + ftsmul z2.d, z0.d, z1.d + )"); + CHECK_NEON(2, double, fillNeon({1.0, -4.0, 16.0, 152.2756}, VL / 8)); + + // 32-bit arrangement + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {1.0f, 2.0f, 4.0f, 12.34f, + -3.0f, -19.6f, 0.0f, 7.0f}; + std::vector fsrcB = {1.0f, -5.4f, 0.0f, 78.2f, + 2.1f, -26.42f, 12.0f, 3.5f}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + ptrue p1.s + + ld1w {z0.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftsmul z2.s, z0.s, z1.s + )"); + CHECK_NEON( + 0, float, + fillNeon({1.0f, 2.0f, 4.0f, 12.34f, -3.0f, -19.6f, 0.0f, 7.0f}, + VL / 16)); + CHECK_NEON( + 1, float, + fillNeon({1.0f, -5.4f, 0.0f, 78.2f, 2.1f, -26.42f, 12.0f, 3.5f}, + VL / 16)); + CHECK_NEON(2, float, + fillNeon( + {1.0f, -4.0f, 16.0f, 152.2756f, 9.0f, -384.16f, 0.0f, 49.0f}, + VL / 16)); +} + TEST_P(InstSve, ld1rd) { initialHeapData_.resize(16); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 6fe35eca5b0b10aac9863ae0b03cbf3dd5ed9626 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 11 Sep 2024 21:54:59 +0100 Subject: [PATCH 28/38] Added comment to ftsmul test --- test/regression/aarch64/instructions/sve.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c6f55865b4..7d89571dde 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4939,6 +4939,10 @@ TEST_P(InstSve, ftsmul) { // 64-bit arrangement double* dheap = reinterpret_cast(initialHeapData_.data()); std::vector srcA64 = {1.0, 2.0, 4.0, 12.34}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". 
+ // We use doubles anyway as we only care about the sign bit, and currently + // "fillHeapCombined" only takes a single templated type std::vector srcB64 = {1.0, -5.4, 0.0, 78.2}; fillHeapCombined(dheap, srcA64, srcB64, VL / 32); @@ -4967,6 +4971,10 @@ TEST_P(InstSve, ftsmul) { float* fheap = reinterpret_cast(initialHeapData_.data()); std::vector fsrcA = {1.0f, 2.0f, 4.0f, 12.34f, -3.0f, -19.6f, 0.0f, 7.0f}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". + // We use floats anyway as we only care about the sign bit, and currently + // "fillHeapCombined" only takes a single templated type std::vector fsrcB = {1.0f, -5.4f, 0.0f, 78.2f, 2.1f, -26.42f, 12.0f, 3.5f}; fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); From f22be5a94b1e9787f99d558f2a0fa396bc2ade75 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 17 Sep 2024 22:01:48 +0100 Subject: [PATCH 29/38] Added FTSSEL tests. Nasty bugger.... --- .../simeng/arch/aarch64/helpers/sve.hh | 8 +- src/lib/arch/aarch64/InstructionMetadata.cc | 4 + test/regression/aarch64/instructions/sve.cc | 87 +++++++++++++++++-- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index d704a38269..99ae84f3b7 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -947,13 +947,11 @@ RegisterValue sveFTrigSSel(srcValContainer& sourceValues, // Place the value 1.0 or a copy of the first source vector element in the // destination element, depending on bit 0 of the corresponding element of // the second source vector. The sign bit of the destination element is - // copied from bit 1 of the second source vector + // negated from bit 1 of the second source vector for (int i = 0; i < partition_num; i++) { - out[i] = m[i] & bit_0_mask ? 1.0 : n[i]; - T sign_bit = m[i] & bit_1_mask ? 1.0 : -1.0; - out[i] = std::abs(out[i]) * sign_bit; + out[i] = m[i] & bit_0_mask ? static_cast(1.0) : n[i]; + out[i] = m[i] & bit_1_mask ? 
-out[i] : out[i]; } - return {out, 256}; } diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index c177ef90f1..9653e3a00a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -93,6 +93,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) [[fallthrough]]; case Opcode::AArch64_FTSMUL_ZZZ_S: [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_H: diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 7d89571dde..eb826f9a5d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4963,8 +4963,10 @@ TEST_P(InstSve, ftsmul) { ld1d {z1.d}, p0/z, [x0, x2, lsl #3] ftsmul z2.d, z0.d, z1.d + ftsmul z3.d, z1.d, z0.d )"); CHECK_NEON(2, double, fillNeon({1.0, -4.0, 16.0, 152.2756}, VL / 8)); + CHECK_NEON(3, double, fillNeon({1.0, 29.16, 0.0, 6115.24}, VL / 8)); // 32-bit arrangement initialHeapData_.resize(VL / 8); @@ -4998,19 +5000,88 @@ TEST_P(InstSve, ftsmul) { ld1w {z1.s}, p0/z, [x0, x2, lsl #2] ftsmul z2.s, z0.s, z1.s + ftsmul z3.s, z1.s, z0.s )"); - CHECK_NEON( - 0, float, - fillNeon({1.0f, 2.0f, 4.0f, 12.34f, -3.0f, -19.6f, 0.0f, 7.0f}, - VL / 16)); - CHECK_NEON( - 1, float, - fillNeon({1.0f, -5.4f, 0.0f, 78.2f, 2.1f, -26.42f, 12.0f, 3.5f}, - VL / 16)); CHECK_NEON(2, float, fillNeon( {1.0f, -4.0f, 16.0f, 152.2756f, 9.0f, -384.16f, 0.0f, 49.0f}, VL / 16)); + CHECK_NEON(3, float, + fillNeon({1.0f, 29.16f, 0.0f, 6115.24f, -4.41f, -698.0164f, + 144.0f, 12.25f}, + VL / 16)); +} + +TEST_P(InstSve, ftssel) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + // We use uint64_t to model doubles here as we care about the bit patterns + // rather than values + uint64_t* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {0x1234, 0xABCD, 0x00000000F0F0FFFF, 0x9876}; + // Note that "The use of the second operand is consistent with it holding an + // integer corresponding to the desired sine-wave quadrant." + std::vector srcB64 = {0x0, 0x8000000000000000, 0x4000000000000000, + 0xC000000000000000}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + + ftssel z2.d, z0.d, z1.d + )"); + CHECK_NEON(2, uint64_t, + fillNeon({0x1234, 0x3ff0000000000000, 0x80000000F0F0FFFF, + 0xbff0000000000000}, + VL / 8)); + + // 32-bit arrangement + // We use uint32_t to model floats here as we care about the bit patterns + // rather than values + initialHeapData_.resize(VL / 8); + uint32_t* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {0x1234, 0xABCD, 0x00F0FFFF, 0x9876}; + // Note that "the elements of the second source vector hold the corresponding + // value of the quadrant Q number as an integer not a floating-point value". 
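+  // Lane by lane, the second-operand patterns below map onto the expected
+  // results as follows:
+  //   0x00000000 -> first source element passed through    (0x00001234)
+  //   0x80000000 -> element replaced with +1.0f             (0x3f800000)
+  //   0x40000000 -> first source element with sign bit set  (0x80F0FFFF)
+  //   0xC0000000 -> element replaced with -1.0f             (0xBF800000)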
+ std::vector fsrcB = {0x0, 0x80000000, 0x40000000, 0xC0000000}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + ptrue p1.s + + ld1w {z0.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftssel z2.s, z0.s, z1.s + )"); + CHECK_NEON(2, uint32_t, + fillNeon({0x1234, 0x3f800000, 0x80F0FFFF, 0xBF800000}, + VL / 16)); } TEST_P(InstSve, ld1rd) { From dad04676fec277c1ca3a009eebc7ee4e1d8f273f Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 20 Sep 2024 14:43:10 +0100 Subject: [PATCH 30/38] Finally got ftmad sorted. Had issues with 32 bit for some reason --- .../simeng/arch/aarch64/helpers/sve.hh | 43 +++++----- test/regression/aarch64/instructions/sve.cc | 78 +++++++++++++++++++ 2 files changed, 99 insertions(+), 22 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 99ae84f3b7..10ee4e5446 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -963,9 +963,9 @@ RegisterValue sveFTrigMad( srcValContainer& sourceValues, const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { - const T* n = sourceValues[0].getAsVector(); - const T* m = sourceValues[1].getAsVector(); - const uint8_t imm = static_cast(metadata.operands[1].imm); + const T* n = sourceValues[1].getAsVector(); + const T* m = sourceValues[2].getAsVector(); + const uint8_t imm = static_cast(metadata.operands[3].imm); const std::array sin64 = {1.0, -0.1666666666666661, @@ -985,31 +985,30 @@ RegisterValue sveFTrigMad( 0.2087558253975872e-08, -0.1135338700720054e-10}; - const std::array sin32 = {1.0, - -1.666666716337e-01, - 8.333330973983e-03, - -1.983967522392e-04, - 2.721174723774e-06, - 0.0, - 0.0, - 0.0}; - - const std::array cos32 = {1.0, - -5.000000000000e-01, - 4.166664928198e-02, - -1.388759003021e-03, - 2.446388680255e-05, - 0.0, - 0.0, - 0.0}; + const std::array sin32 = {1.0f, + -1.666666716337e-01f, + 8.333330973983e-03f, + -1.983967522392e-04f, + 2.721174723774e-06f, + 0.0f, + 0.0f, + 0.0f}; + + const std::array cos32 = {1.0f, + -5.000000000000e-01f, + 4.166664928198e-02f, + -1.388759003021e-03f, + 2.446388680255e-05f, + 0.0f, + 0.0f, + 0.0f}; const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; - // std::array lut; for (int i = 0; i < partition_num; i++) { T coeff; - const bool sign_bit = m[i] < 0 ? 1 : 0; + const bool sign_bit = std::signbit(m[i]); // If float then use those LUTs if (sizeof(T) == 4) { coeff = sign_bit ? 
cos32[imm] : sin32[imm]; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index eb826f9a5d..eda1d97602 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5084,6 +5084,84 @@ TEST_P(InstSve, ftssel) { VL / 16)); } +TEST_P(InstSve, ftmad) { + initialHeapData_.resize(VL / 4); + // 64-bit arrangement + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector srcA64 = {0.0, 0.5, -0.5, 0.75}; + std::vector srcB64 = {0.0, 0.5, -0.4, -0.2}; + fillHeapCombined(dheap, srcA64, srcB64, VL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #8 + addvl x2, x2, #1 + udiv x2, x2, x3 + ptrue p0.d + + ld1d {z0.d}, p0/z, [x0, x1, lsl #3] + ld1d {z1.d}, p0/z, [x0, x2, lsl #3] + mov z2.d, z0.d + mov z3.d, z0.d + mov z4.d, z0.d + + ftmad z2.d, z2.d, z1.d, #0 + ftmad z3.d, z3.d, z1.d, #2 + ftmad z4.d, z4.d, z1.d, #7 + )"); + CHECK_NEON(2, double, fillNeon({1.0, 1.25, 0.8, 1.15}, VL / 8)); + CHECK_NEON(3, double, + fillNeon({0.008333333333320002, 0.258333333333320002, + -0.15833333333333355, 0.19166666666666645}, + VL / 8)); + CHECK_NEON( + 4, double, + fillNeon({0.0, 0.25, -0.20000000001135337, 0.1499999999886466}, + VL / 8)); + + // 32-bit arrangement + initialHeapData_.resize(VL / 4); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrcA = {0.0f, 0.5f, -0.5f, 0.75f}; + std::vector fsrcB = {0.0f, 0.5f, -0.4f, -0.2f}; + fillHeapCombined(fheap, fsrcA, fsrcB, VL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #0 + mov x2, #0 + mov x3, #4 + addvl x2, x2, #1 + sdiv x2, x2, x3 + + whilelo p0.s, xzr, x2 + + ld1w {z2.s}, p0/z, [x0] + ld1w {z3.s}, p0/z, [x0] + ld1w {z4.s}, p0/z, [x0, x1, lsl #2] + ld1w {z1.s}, p0/z, [x0, x2, lsl #2] + + ftmad z2.s, z2.s, z1.s, #0 + ftmad z3.s, z3.s, z1.s, #2 + ftmad z4.s, z4.s, z1.s, #7 + )"); + CHECK_NEON(2, float, fillNeon({1.0f, 1.25f, 0.8f, 1.15f}, VL / 8)); + CHECK_NEON(3, float, + fillNeon( + {0.00833333f, 0.25833333f, -0.1583334f, 0.1916666f}, VL / 8)); + CHECK_NEON(4, float, fillNeon({0.0f, 0.25f, -0.2f, 0.15f}, VL / 8)); +} + TEST_P(InstSve, ld1rd) { initialHeapData_.resize(16); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 5a611d31f922c04dbebbfb14fc4facb96fe5e2dc Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 20 Sep 2024 17:51:44 +0100 Subject: [PATCH 31/38] Added LDAXRB and STLXR insts. 
STLXR took some fix in decode to flag as a store --- src/lib/arch/aarch64/Instruction_address.cc | 8 +- src/lib/arch/aarch64/Instruction_decode.cc | 5 +- src/lib/arch/aarch64/Instruction_execute.cc | 8 +- test/regression/aarch64/instructions/load.cc | 62 +++++++++++++ test/regression/aarch64/instructions/store.cc | 89 +++++++++++++++++++ 5 files changed, 165 insertions(+), 7 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 3f27b5acc3..06eb7e2004 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1365,11 +1365,15 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 1}}); break; } - case Opcode::AArch64_STLXRW: { // stlxr ws, wt, [xn] + case Opcode::AArch64_STLXRH: { // stlxrb ws, ht, [xn] + setMemoryAddresses({{sourceValues_[1].get(), 2}}); + break; + } + case Opcode::AArch64_STLXRW: { // stlxrb ws, wt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 4}}); break; } - case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] + case Opcode::AArch64_STLXRX: { // stlxr ws, xwt, [xn] setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..de68245ff6 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -499,8 +499,9 @@ void Instruction::decode() { // Check first operand access to determine if it's a load or store if (metadata_.operands[0].access & CS_AC_WRITE) { - if (metadata_.id == AARCH64_INS_STXR || - metadata_.id == AARCH64_INS_STLXR) { + if (metadata_.id == ARM64_INS_STXR || metadata_.id == ARM64_INS_STLXR || + metadata_.id == ARM64_INS_STLXRB || + metadata_.id == ARM64_INS_STLXRH) { // Exceptions to this is load condition are exclusive store with a // success flag as first operand if (microOpcode_ != MicroOpcode::STR_DATA) { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index fecf3a36ae..8e8706bcc5 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5176,17 +5176,19 @@ void Instruction::execute() { } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] - // STORE + // STORE + std::cout << "sv0: " << sourceValues_[0] << "\n"; memoryData_[0] = sourceValues_[0]; break; } case Opcode::AArch64_STLXRB: // stlxrb ws, wt, [xn] + case Opcode::AArch64_STLXRH: // stlxrh ws, wt, [xn] case Opcode::AArch64_STLXRW: // stlxr ws, wt, [xn] case Opcode::AArch64_STLXRX: { // stlxr ws, xt, [xn] // STORE memoryData_[0] = sourceValues_[0]; - // TODO: Implement atomic memory access - results_[0] = static_cast(0); + // TODO: Implement atomic memory access + results_[0] = {0, 8}; break; } case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 05ffdd90a0..ed165943af 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -695,6 +695,68 @@ TEST_P(InstLoad, ldarb) { EXPECT_EQ(getGeneralRegister(7), 64); } +TEST_P(InstLoad, ldaxrb) { + initialHeapData_.resize(8); + uint32_t* heap = reinterpret_cast(initialHeapData_.data()); + heap[0] = 0xDEADBEEF; + heap[1] = 0x12345678; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + ldaxrb w1, [x0] + 
add x0, x0, #1 + ldaxrb w2, [x0] + add x0, x0, #1 + ldaxrb w3, [x0] + add x0, x0, #1 + ldaxrb w4, [x0] + add x0, x0, #1 + ldaxrb w5, [x0] + add x0, x0, #1 + ldaxrb w6, [x0] + add x0, x0, #1 + ldaxrb w7, [x0] + add x0, x0, #1 + ldaxrb w8, [x0] + )"); + EXPECT_EQ(getGeneralRegister(1), 0xEF); + EXPECT_EQ(getGeneralRegister(2), 0xBE); + EXPECT_EQ(getGeneralRegister(3), 0xAD); + EXPECT_EQ(getGeneralRegister(4), 0xDE); + EXPECT_EQ(getGeneralRegister(5), 0x78); + EXPECT_EQ(getGeneralRegister(6), 0x56); + EXPECT_EQ(getGeneralRegister(7), 0x34); + EXPECT_EQ(getGeneralRegister(8), 0x12); + + RUN_AARCH64(R"( + sub sp, sp, #1024 + mov w0, #16 + mov w1, #32 + mov w2, #48 + mov w3, #64 + str w0, [sp], #32 + str w1, [sp], #32 + str w2, [sp], #32 + str w3, [sp], #32 + sub sp, sp, #128 + ldaxrb w4, [sp] + add sp, sp, #32 + ldaxrb w5, [sp] + add sp, sp, #32 + ldaxrb w6, [sp] + add sp, sp, #32 + ldaxrb w7, [sp] + )"); + + EXPECT_EQ(getGeneralRegister(4), 16); + EXPECT_EQ(getGeneralRegister(5), 32); + EXPECT_EQ(getGeneralRegister(6), 48); + EXPECT_EQ(getGeneralRegister(7), 64); +} + TEST_P(InstLoad, ldrb) { initialHeapData_.resize(8); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b494..c2298693b8 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -60,6 +60,95 @@ TEST_P(InstStore, stlr) { 0xBABA); } +TEST_P(InstStore, stlxr) { + // stlxrb + RUN_AARCH64(R"( + mov w0, 0xAB + mov w1, 0x12 + mov w2, 0xCD + mov w3, 0x34 + sub sp, sp, #4 + stlxrb w4, w0, [sp] + add sp, sp, #1 + stlxrb w5, w1, [sp] + add sp, sp, #1 + stlxrb w6, w2, [sp] + add sp, sp, #1 + stlxrb w7, w3, [sp] + add sp, sp, #1 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xAB); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 3), + 0x12); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0xCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 1), + 0x34); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + // stlxrh + RUN_AARCH64(R"( + mov w0, 0xABCD + mov w1, 0x1234 + mov w2, 0xCDEF + mov w3, 0x3456 + sub sp, sp, #8 + stlxrh w4, w0, [sp] + add sp, sp, #2 + stlxrh w5, w1, [sp] + add sp, sp, #2 + stlxrh w6, w2, [sp] + add sp, sp, #2 + stlxrh w7, w3, [sp] + add sp, sp, #2 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xABCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 6), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0xCDEF); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 2), + 0x3456); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); + + // stlxr + RUN_AARCH64(R"( + mov w0, 0xABCD + mov w1, 0x1234 + mov w2, 0xCDEF + mov w3, 0x3456 + sub sp, sp, #24 + stlxr w4, x0, [sp] + add sp, sp, #8 + stlxr w5, x1, [sp] + add sp, sp, #8 + stlxr w6, w2, [sp] + add sp, sp, #4 + stlxr w7, w3, [sp] + add sp, sp, #4 + )"); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 24), + 0xABCD); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 16), + 0x1234); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 8), + 0xCDEF); + 
EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4), + 0x3456); + EXPECT_EQ(getGeneralRegister(4), 0); + EXPECT_EQ(getGeneralRegister(5), 0); + EXPECT_EQ(getGeneralRegister(6), 0); + EXPECT_EQ(getGeneralRegister(7), 0); +} + TEST_P(InstStore, strb) { RUN_AARCH64(R"( mov w0, 0xAB From a58409bd39ca883f9f7f91acc0efd30277287a4c Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Fri, 20 Sep 2024 18:18:22 +0100 Subject: [PATCH 32/38] Added test for ORN. Finished all base tests --- test/regression/aarch64/instructions/neon.cc | 21 ++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 680574158a..1a51df01e7 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -1090,6 +1090,27 @@ TEST_P(InstNeon, eor) { CHECK_NEON(3, uint8_t, {1, 3, 1, 7, 1, 3, 1, 15, 0, 0, 0, 0, 0, 0, 0, 0}); } +TEST_P(InstNeon, orn) { + initialHeapData_.resize(16); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + for (int i = 0; i < 8; i++) { + heap[i] = i; + heap[i + 8] = i + 1; + } + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #8] + + orn v2.8b, v0.8b, v1.8b + )"); + CHECK_NEON(2, uint8_t, {254, 253, 254, 251, 254, 253, 254, 247}); +} + TEST_P(InstNeon, ext) { RUN_AARCH64(R"( movi v0.16b, #0xAB From 0ffcd51bb0e268adc0b084604d66cb24afd714f7 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Tue, 10 Dec 2024 13:33:05 +0000 Subject: [PATCH 33/38] Added group tests to all added insts --- src/lib/arch/aarch64/Instruction_execute.cc | 2 -- test/regression/aarch64/instructions/load.cc | 5 +++ test/regression/aarch64/instructions/neon.cc | 8 +++++ test/regression/aarch64/instructions/store.cc | 7 +++++ test/regression/aarch64/instructions/sve.cc | 31 +++++++++++++++++++ 5 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8e8706bcc5..23a51c190c 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5176,8 +5176,6 @@ void Instruction::execute() { } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] - // STORE - std::cout << "sv0: " << sourceValues_[0] << "\n"; memoryData_[0] = sourceValues_[0]; break; } diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index ed165943af..83737c14ce 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -3,6 +3,7 @@ namespace { using InstLoad = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstLoad, ld1r) { // 8-bit @@ -755,6 +756,8 @@ TEST_P(InstLoad, ldaxrb) { EXPECT_EQ(getGeneralRegister(5), 32); EXPECT_EQ(getGeneralRegister(6), 48); EXPECT_EQ(getGeneralRegister(7), 64); + + EXPECT_GROUP(R"(ldaxrb w7, [sp])", LOAD_INT); } TEST_P(InstLoad, ldrb) { @@ -1353,6 +1356,8 @@ TEST_P(InstLoad, ldrsw) { EXPECT_EQ(getGeneralRegister(4), -5); EXPECT_EQ(getGeneralRegister(5), -5); + EXPECT_GROUP(R"(ldrsw x4, [x0, x6, lsl #2])", LOAD_INT); + // ldursw RUN_AARCH64(R"( # Get heap address diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 1a51df01e7..91dee06ebb 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ 
b/test/regression/aarch64/instructions/neon.cc @@ -374,6 +374,8 @@ TEST_P(InstNeon, uaddlv) { uaddlv h1, v0.8b )"); CHECK_NEON(1, uint16_t, {36}); + + EXPECT_GROUP(R"(uaddlv h1, v0.8b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstNeon, and) { @@ -770,6 +772,8 @@ TEST_P(InstNeon, cmeq) { cmeq v2.4s, v0.4s, v1.4s )"); CHECK_NEON(2, uint32_t, {0, 0xFFFFFFFFu, 0xFFFFFFFFu, 0}); + + EXPECT_GROUP(R"(cmeq v2.4s, v0.4s, v1.4s)", VECTOR_SIMPLE_CMP); } TEST_P(InstNeon, cmhs) { @@ -883,6 +887,8 @@ TEST_P(InstNeon, cmhi) { )"); CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0}); CHECK_NEON(3, uint32_t, {0x0, 0xFFFFFFFF, 0x0, 0x0}); + + EXPECT_GROUP(R"(cmhi v3.4s, v1.4s, v0.4s)", VECTOR_SIMPLE_CMP); } TEST_P(InstNeon, cnt) { @@ -1109,6 +1115,8 @@ TEST_P(InstNeon, orn) { orn v2.8b, v0.8b, v1.8b )"); CHECK_NEON(2, uint8_t, {254, 253, 254, 251, 254, 253, 254, 247}); + + EXPECT_GROUP(R"(orn v2.8b, v0.8b, v1.8b)", VECTOR_SIMPLE_LOGICAL_NOSHIFT); } TEST_P(InstNeon, ext) { diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index c2298693b8..2b43e510e4 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -3,6 +3,7 @@ namespace { using InstStore = AArch64RegressionTest; +using namespace simeng::arch::aarch64::InstructionGroups; TEST_P(InstStore, stlr) { // stlrb @@ -90,6 +91,8 @@ TEST_P(InstStore, stlxr) { EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_GROUP(R"(stlxrb w7, w3, [sp])", STORE_ADDRESS_INT); + // stlxrh RUN_AARCH64(R"( mov w0, 0xABCD @@ -119,6 +122,8 @@ TEST_P(InstStore, stlxr) { EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 0); + EXPECT_GROUP(R"(stlxrh w7, w3, [sp])", STORE_ADDRESS_INT); + // stlxr RUN_AARCH64(R"( mov w0, 0xABCD @@ -147,6 +152,8 @@ TEST_P(InstStore, stlxr) { EXPECT_EQ(getGeneralRegister(5), 0); EXPECT_EQ(getGeneralRegister(6), 0); EXPECT_EQ(getGeneralRegister(7), 0); + + EXPECT_GROUP(R"(stlxr w7, w3, [sp])", STORE_ADDRESS_INT); } TEST_P(InstStore, strb) { diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index eda1d97602..ab027b408a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -1506,6 +1506,8 @@ TEST_P(InstSve, cmphs_vec) { )"); CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8)); EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(cmphs p1.d, p0/z, z1.d, z0.d)", PREDICATE); } TEST_P(InstSve, cnt) { @@ -1868,6 +1870,8 @@ TEST_P(InstSve, cpy) { CHECK_NEON(3, double, fillNeon({static_cast(-16)}, VL / 16)); CHECK_NEON(4, double, fillNeon({12}, VL / 8)); CHECK_NEON(5, double, fillNeon({static_cast(-8)}, VL / 16)); + + EXPECT_GROUP(R"(cpy z3.d, p1/m, d9)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, fcpy) { @@ -3775,6 +3779,8 @@ TEST_P(InstSve, fdiv) { CHECK_NEON(1, double, fillNeon(dresults, VL / 8)); std::rotate(dsrcB.begin(), dsrcB.begin() + ((VL / 128) % 8), dsrcB.end()); CHECK_NEON(2, double, fillNeonCombined(dresults, dsrcB, VL / 8)); + + EXPECT_GROUP(R"(fdiv z2.d, p0/m, z2.d, z0.d)", SVE_DIV_OR_SQRT); } TEST_P(InstSve, fnmls) { @@ -5010,6 +5016,8 @@ TEST_P(InstSve, ftsmul) { fillNeon({1.0f, 29.16f, 0.0f, 6115.24f, -4.41f, -698.0164f, 144.0f, 12.25f}, VL / 16)); + + EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_MUL); } TEST_P(InstSve, ftssel) { @@ -5082,6 +5090,8 @@ TEST_P(InstSve, ftssel) { CHECK_NEON(2, uint32_t, fillNeon({0x1234, 0x3f800000, 0x80F0FFFF, 0xBF800000}, VL / 16)); + 
+ EXPECT_GROUP(R"(ftssel z2.s, z0.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, ftmad) { @@ -5160,6 +5170,8 @@ TEST_P(InstSve, ftmad) { fillNeon( {0.00833333f, 0.25833333f, -0.1583334f, 0.1916666f}, VL / 8)); CHECK_NEON(4, float, fillNeon({0.0f, 0.25f, -0.2f, 0.15f}, VL / 8)); + + EXPECT_GROUP(R"(ftmad z4.s, z4.s, z1.s, #7)", SVE_MUL); } TEST_P(InstSve, ld1rd) { @@ -6217,6 +6229,8 @@ TEST_P(InstSve, pfirst) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); + + EXPECT_GROUP(R"(pfirst p5.b, p1, p5.b)", PREDICATE); } TEST_P(InstSve, ptrue) { @@ -6330,6 +6344,8 @@ TEST_P(InstSve, pnext) { CHECK_PREDICATE(1, uint64_t, fillPredFromSource({0x1, 0, 0, 0}, 32)); EXPECT_EQ(getNZCV(), 0b1010); + + EXPECT_GROUP(R"(pnext p1.d, p3, p1.d)", PREDICATE); } TEST_P(InstSve, punpk) { @@ -6868,6 +6884,8 @@ TEST_P(InstSve, smax) { fillNeon({127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127}, VL / 8)); + + EXPECT_GROUP(R"(smax z5.b, z5.b, #127)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, smin) { @@ -7098,6 +7116,9 @@ TEST_P(InstSve, smin) { std::min_element(arrB8.begin(), arrB8.end() - (256 - VL / 8)))]; CHECK_NEON(3, int8_t, {minElemA8, 0, 0, 0}); CHECK_NEON(4, int8_t, {minElemB8, 0, 0, 0}); + + EXPECT_GROUP(R"(smin z2.b, p1/m, z2.b, z0.b)", SVE_SIMPLE_ARTH_NOSHIFT); + EXPECT_GROUP(R"(sminv b4, p0, z2.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, smulh) { @@ -7268,6 +7289,8 @@ TEST_P(InstSve, umaxv) { CHECK_NEON(11, uint8_t, {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}) + + EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, clastb) { @@ -7370,6 +7393,8 @@ TEST_P(InstSve, clastb) { )"); CHECK_NEON(0, uint64_t, fillNeon({0xEF}, 8)); CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); + + EXPECT_GROUP(R"(clastb b2, p0, b2, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, lastb) { @@ -7474,6 +7499,8 @@ TEST_P(InstSve, lastb) { )"); CHECK_NEON(0, uint64_t, fillNeon({0x01}, 8)); CHECK_NEON(1, uint64_t, fillNeon({0x1F}, 8)); + + EXPECT_GROUP(R"(lastb b4, p0, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, splice) { @@ -7516,6 +7543,8 @@ TEST_P(InstSve, splice) { )"); CHECK_NEON(0, float, fillNeon({1.5}, VL / 8)); CHECK_NEON(2, float, fillNeonCombined({1.5}, {-0.5}, VL / 8)); + + EXPECT_GROUP(R"(splice z2.s, p1, z2.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT); } TEST_P(InstSve, st1b) { @@ -9250,6 +9279,8 @@ TEST_P(InstSve, whilels) { )"); CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 8)); EXPECT_EQ(getNZCV(), 0b1000); + + EXPECT_GROUP(R"(whilels p3.d, xzr, x0)", PREDICATE); } TEST_P(InstSve, whilelt) { From 4361eabc5c1090070b3c93de3bc9ab8973aa7c79 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Mon, 16 Dec 2024 17:52:10 +0000 Subject: [PATCH 34/38] Cleaned up infinite ROB check and OpenMP bug --- src/include/simeng/pipeline/ReorderBuffer.hh | 6 +---- src/lib/arch/aarch64/ExceptionHandler.cc | 3 +-- src/lib/pipeline/ReorderBuffer.cc | 24 +++++++++----------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index 06a9aefadd..c7ee01fcc6 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -125,13 +125,9 @@ class ReorderBuffer { */ uint64_t pc_; - /** The address of the last instruction at the head of the ROB to 
check if - * it's stuck */ - uint64_t last_inst_addr = 0; - /** A counter for how many cycles the same instruction has been at the head of * the ROB */ - uint64_t inst_repeat_counter = 0; + uint64_t robHeadRepeatCounter_ = 0; /** The sequence ID of the youngest instruction that should remain after the * current flush. */ diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index 33701b049b..639f8e0655 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -431,14 +431,13 @@ bool ExceptionHandler::init() { } uint64_t retval = static_cast(bitmask); stateChange = {ChangeType::REPLACEMENT, {R0}, {sizeof(retval)}}; - stateChange.memoryAddresses.push_back({mask, 8}); + stateChange.memoryAddresses.push_back({mask, sizeof(bitmask)}); stateChange.memoryAddressValues.push_back(bitmask); } else { stateChange = {ChangeType::REPLACEMENT, {R0}, {-1ll}}; } break; } - case 131: { // tgkill // TODO: Functionality temporarily omitted since simeng only has a // single thread at the moment diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index 1ff4a6b6c5..33326944a3 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -81,23 +81,21 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { unsigned int n; for (n = 0; n < maxCommits; n++) { auto& uop = buffer_[0]; - if (uop->getInstructionAddress() == last_inst_addr) { - inst_repeat_counter++; - } else { - inst_repeat_counter = 0; - } - if (inst_repeat_counter > 10000000) { - std::cout - << "Infinite loop detected in rob commit at instruction address " - << std::hex << uop->getInstructionAddress() << std::dec << " (" - << uop->getMicroOpIndex() << "). Killing.\n"; - exit(1); - } - last_inst_addr = uop->getInstructionAddress(); if (!uop->canCommit()) { + // If an instruction has been stuck at the head of the rob for + // sufficiently long, assume an error in SimEng has occured. + robHeadRepeatCounter_++; + if (robHeadRepeatCounter_ > 10000000) { + std::cerr << "[SimEng:ReorderBuffer] Infinite loop detected in rob " + "commit at instruction address " + << std::hex << uop->getInstructionAddress() << std::dec + << " (" << uop->getMicroOpIndex() << ")." << std::endl; + exit(1); + } break; } + robHeadRepeatCounter_ = 0; if (uop->isLastMicroOp()) instructionsCommitted_++; From 6345f0824d23dbe31f99b505390d7a99a0f81e19 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 18 Dec 2024 11:21:25 +0000 Subject: [PATCH 35/38] Responded to PR comments. Cleaned up a lot of helper functions and fixed a few metadata issues --- .../simeng/arch/aarch64/helpers/neon.hh | 29 ++++++ .../simeng/arch/aarch64/helpers/sve.hh | 88 ++++++------------ src/lib/arch/aarch64/Instruction_execute.cc | 17 ++-- test/regression/aarch64/instructions/load.cc | 27 +----- test/regression/aarch64/instructions/neon.cc | 91 ++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 92 +------------------ 6 files changed, 160 insertions(+), 184 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..98c1648d6b 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -558,6 +558,35 @@ RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for NEON instructions with the format `uaddlv zd, zn.T`. 
+ * T represents the type of the destination register (e.g. for h0, T = + * uint32_t). + * U represents the type of the sourceValues[0] (e.g. for v0.8b, U = + * uint8_t) + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecAddlv(srcValContainer& sourceValues) { + const U* n = sourceValues[0].getAsVector(); + T out = 0; + for (int i = 0; i < I; i++) { + out += n[i]; + } + return {out, 256}; +} + +/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. + * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue vecUMaxV(srcValContainer& sourceValues) { + const T* n = sourceValues[0].getAsVector(); + T out = n[0]; + for (int i = 1; i < I; i++) { + out = std::max(n[i], out); + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `umaxp vd, vn, vm`. * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t). * I represents the number of elements in the output array to be updated (e.g. diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 10ee4e5446..8b23bb0ea9 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -114,33 +114,6 @@ RegisterValue sveAddvPredicated(srcValContainer& sourceValues, return {out, 256}; } -/** Helper function for NEON instructions with the format `uaddlv Vd, Vn.T`. - * T represents the type of the destination register (e.g. for h0, T = - * uint32_t). U represents the type of the sourceValues[0] (e.g. for v0.8b, U = - * uint8_t) Returns correctly formatted RegisterValue. */ -template -RegisterValue sveAddlv(srcValContainer& sourceValues) { - const U* n = sourceValues[0].getAsVector(); - T out = 0; - for (int i = 0; i < I; i++) { - out += n[i]; - } - return {out, 256}; -} - -/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. - * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). - * Returns correctly formatted RegisterValue. */ -template -RegisterValue sveUMaxV(srcValContainer& sourceValues) { - const T* n = sourceValues[0].getAsVector(); - T out = n[0]; - for (int i = 1; i < I; i++) { - out = std::max(n[i], out); - } - return {out, 256}; -} - /** Helper function for SVE instructions with the format `adr zd, [zn, zm{, * lsl #<1,2,3>}]`. * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t). @@ -284,7 +257,7 @@ RegisterValue sveCpy_imm( return {out, 256}; } -/** Helper function for SVE instructions with the format `cpy zd, pg/m, vn +/** Helper function for SVE instructions with the format `cpy zd, pg/m, rn * T represents the type of sourceValues (e.g. for zd.d, T = int64_t). * Returns correctly formatted RegisterValue. 
*/ template @@ -294,7 +267,7 @@ RegisterValue sveCpy_Scalar( const uint16_t VL_bits) { const T* zd = sourceValues[0].getAsVector(); const uint64_t* p = sourceValues[1].getAsVector(); - const T vn = sourceValues[2].get(); + const T rn = sourceValues[2].get(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out[256 / sizeof(T)] = {0}; @@ -302,7 +275,7 @@ RegisterValue sveCpy_Scalar( for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); if (p[i / (64 / sizeof(T))] & shifted_active) { - out[i] = vn; + out[i] = rn; } else { out[i] = zd[i]; } @@ -956,7 +929,8 @@ RegisterValue sveFTrigSSel(srcValContainer& sourceValues, } /** Helper function for SVE instructions with the format `ftmad zd, zn, zm, - * #imm`. T represents the type of sourceValues (e.g. for zn.d, T = double). + * #imm`. + * T represents the type of sourceValues (e.g. for zn.d, T = double). * Returns correctly formatted RegisterValue. **/ template RegisterValue sveFTrigMad( @@ -1112,15 +1086,14 @@ RegisterValue sveIndex( return {out, 256}; } -/** Helper function for SVE instructions with the format `lastb vd, pg, zn`. +/** Helper function for SVE instructions with the format `lastb rd, pg, zn`. * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template RegisterValue sveLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - // sourceValues are wrong and the correct value is in the previous index. - const uint64_t* p = sourceValues[1].getAsVector(); - const T* n = sourceValues[2].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const T* n = sourceValues[1].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out; @@ -1141,15 +1114,15 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, return {out, 256}; } -/** Helper function for SVE instructions with the format `clastb vd, pg, vd, - * zn`. T represents the vector register type (e.g. zd.d would be uint64_t). +/** Helper function for SVE instructions with the format `clastb rd, pg, rd, + * zn`. + * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template RegisterValue sveCLastBScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - // sourceValues are wrong and the correct value is in the previous index. const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t* m = sourceValues[2].getAsVector(); + const uint64_t m = sourceValues[2].get(); const T* n = sourceValues[3].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); @@ -1166,9 +1139,9 @@ RegisterValue sveCLastBScalar(srcValContainer& sourceValues, } if (lastElem < 0) { - out = static_cast(static_cast(m[0])); + out = m; } else { - out = static_cast(static_cast(n[lastElem])); + out = n[lastElem]; } return {out, 256}; } @@ -1505,7 +1478,8 @@ RegisterValue sveOrr_3vecs(srcValContainer& sourceValues, /** Helper function for SVE2 instructions with the format `psel pd, pn, * pm.t[wa, #imm]`. * T represents the type of sourceValues (e.g. for pm.d, T = - * uint64_t). Returns an array of 4 uint64_t elements. */ + * uint64_t). + * Returns an array of 4 uint64_t elements. */ template std::array svePsel( srcValContainer& sourceValues, @@ -1530,12 +1504,13 @@ std::array svePsel( return out; } -/** Helper function for SVE instructions with the format `pfirst pdn, pg, pdn`. - * Returns an array of 4 uint64_t elements. 
*/ -std::array svePfirst(srcValContainer& sourceValues, - const uint16_t VL_bits) { +/** Helper function for SVE instructions with the format `pfirst pdn.b, pg, + * pdn.b`. + * Returns an array of 4 uint64_t elements, and updates the NZCV flags. + */ +std::tuple, uint8_t> svePfirst( + srcValContainer& sourceValues, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / 8; - // sourceValues are wrong and the correct value is in the previous index. const uint64_t* p = sourceValues[1].getAsVector(); const uint64_t* dn = sourceValues[2].getAsVector(); // Set destination d as source n to copy all false lanes and the active lanes @@ -1549,10 +1524,11 @@ std::array svePfirst(srcValContainer& sourceValues, break; } } - return out; + return {out, getNZCVfromPred(out, VL_bits, 1)}; } /** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`. + * T represents the type of sourceValues (e.g. for pdn.d, T = uint64_t). * Returns an array of 4 uint64_t elements, and updates the NZCV flags. */ template std::tuple, uint8_t> svePnext( @@ -1565,21 +1541,13 @@ std::tuple, uint8_t> svePnext( // Set destination elements to 0 std::array out = {0, 0, 0, 0}; - // Get pattern - const uint16_t count = - sveGetPattern(metadata.operandStr, sizeof(T) * 8, VL_bits); - - // Exit early if count == 0 - if (count == 0) return {out, getNZCVfromPred(out, VL_bits, sizeof(T))}; // Get last active element of dn.pattern int lastElem = -1; for (int i = partition_num - 1; i >= 0; i--) { - if (i < count) { - uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - if (dn[i / (64 / sizeof(T))] & shifted_active) { - lastElem = i; - break; - } + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + if (dn[i / (64 / sizeof(T))] & shifted_active) { + lastElem = i; + break; } } // Get next active element of p, starting from last of dn.pattern diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 23a51c190c..2c87ee9ed3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -206,7 +206,7 @@ void Instruction::execute() { break; } case Opcode::AArch64_UADDLVv8i8v: { // uaddlv hd, vn.8b - results_[0] = sveAddlv(sourceValues_); + results_[0] = vecAddlv(sourceValues_); break; } case Opcode::AArch64_ADDWri: { // add wd, wn, #imm{, shift} @@ -4154,7 +4154,9 @@ void Instruction::execute() { break; } case Opcode::AArch64_PFIRST_B: { // pfirst pdn.b, pg, pdn.b - results_[0] = svePfirst(sourceValues_, VL_bits); + auto [result, nzcv] = svePfirst(sourceValues_, VL_bits); + results_[0] = nzcv; + results_[1] = result; break; } case Opcode::AArch64_PNEXT_B: { // pnext pdn.b, pv, pdn.b @@ -5176,6 +5178,7 @@ void Instruction::execute() { } case Opcode::AArch64_STLRW: // stlr wt, [xn] case Opcode::AArch64_STLRX: { // stlr xt, [xn] + // STORE memoryData_[0] = sourceValues_[0]; break; } @@ -5805,23 +5808,23 @@ void Instruction::execute() { break; } case Opcode::AArch64_UMAXVv16i8v: { // umaxv bd, vn.16b - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv4i16v: { // umaxv hd, vn.4h - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv4i32v: { // umaxv sd, vn.4s - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv8i16v: { // umaxv hd, vn.8h - results_[0] = sveUMaxV(sourceValues_); + results_[0] 
= vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMAXVv8i8v: { // umaxv bd, vn.8b - results_[0] = sveUMaxV(sourceValues_); + results_[0] = vecUMaxV(sourceValues_); break; } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 83737c14ce..bf5a3cad47 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -732,32 +732,7 @@ TEST_P(InstLoad, ldaxrb) { EXPECT_EQ(getGeneralRegister(7), 0x34); EXPECT_EQ(getGeneralRegister(8), 0x12); - RUN_AARCH64(R"( - sub sp, sp, #1024 - mov w0, #16 - mov w1, #32 - mov w2, #48 - mov w3, #64 - str w0, [sp], #32 - str w1, [sp], #32 - str w2, [sp], #32 - str w3, [sp], #32 - sub sp, sp, #128 - ldaxrb w4, [sp] - add sp, sp, #32 - ldaxrb w5, [sp] - add sp, sp, #32 - ldaxrb w6, [sp] - add sp, sp, #32 - ldaxrb w7, [sp] - )"); - - EXPECT_EQ(getGeneralRegister(4), 16); - EXPECT_EQ(getGeneralRegister(5), 32); - EXPECT_EQ(getGeneralRegister(6), 48); - EXPECT_EQ(getGeneralRegister(7), 64); - - EXPECT_GROUP(R"(ldaxrb w7, [sp])", LOAD_INT); + EXPECT_GROUP(R"(ldaxrb w8, [x0])", LOAD_INT); } TEST_P(InstLoad, ldrb) { diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 91dee06ebb..c66f6f3c6f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -2878,6 +2878,97 @@ TEST_P(InstNeon, umaxp) { 0xCC, 0xBB, 0xAA, 0x99, 0x88}); } +TEST_P(InstNeon, umaxv) { + // umaxv vd, vn.t + initialHeapData_.resize(32); + uint8_t* heap = reinterpret_cast(initialHeapData_.data()); + + // v0 + heap[0] = 0x01; + heap[1] = 0x00; + heap[2] = 0xFF; + heap[3] = 0xAA; + heap[4] = 0xBB; + heap[5] = 0xCC; + heap[6] = 0xDD; + heap[7] = 0xEE; + heap[8] = 0x07; + heap[9] = 0x00; + heap[10] = 0xFC; + heap[11] = 0xFD; + heap[12] = 0xBA; + heap[13] = 0xCA; + heap[14] = 0x39; + heap[15] = 0xEF; + + // v1 + heap[16] = 0x00; + heap[17] = 0x00; + heap[18] = 0xEE; + heap[19] = 0x11; + heap[20] = 0x22; + heap[21] = 0x33; + heap[22] = 0x44; + heap[23] = 0x55; + heap[24] = 0x26; + heap[25] = 0xFF; + heap[26] = 0xEA; + heap[27] = 0xFA; + heap[28] = 0x14; + heap[29] = 0x43; + heap[30] = 0x21; + heap[31] = 0xAE; + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + umaxv h2, v0.4h + umaxv h3, v1.4h + + umaxv h4, v0.8h + umaxv h5, v1.8h + + umaxv s6, v0.4s + umaxv s7, v1.4s + + umaxv b8, v0.8b + umaxv b9, v1.8b + + umaxv b10, v0.16b + umaxv b11, v1.16b + + )"); + CHECK_NEON(2, uint16_t, + {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(3, uint16_t, + {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(4, uint16_t, + {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(5, uint16_t, + {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); + CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); + CHECK_NEON(8, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(9, uint8_t, + {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(10, uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + CHECK_NEON(11, 
uint8_t, + {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}) + + EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); +} + TEST_P(InstNeon, smax) { initialHeapData_.resize(32); uint32_t* heap = reinterpret_cast(initialHeapData_.data()); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index ab027b408a..6a75c597cc 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6229,6 +6229,7 @@ TEST_P(InstSve, pfirst) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1)); CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1)); CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1)); + EXPECT_EQ(getNZCV(), 0b0110); EXPECT_GROUP(R"(pfirst p5.b, p1, p5.b)", PREDICATE); } @@ -7202,97 +7203,6 @@ TEST_P(InstSve, smulh) { fillNeonCombined({-12}, {-1076902265}, VL / 8)); } -TEST_P(InstSve, umaxv) { - // umaxv vd, vn.t - initialHeapData_.resize(32); - uint8_t* heap = reinterpret_cast(initialHeapData_.data()); - - // v0 - heap[0] = 0x01; - heap[1] = 0x00; - heap[2] = 0xFF; - heap[3] = 0xAA; - heap[4] = 0xBB; - heap[5] = 0xCC; - heap[6] = 0xDD; - heap[7] = 0xEE; - heap[8] = 0x07; - heap[9] = 0x00; - heap[10] = 0xFC; - heap[11] = 0xFD; - heap[12] = 0xBA; - heap[13] = 0xCA; - heap[14] = 0x39; - heap[15] = 0xEF; - - // v1 - heap[16] = 0x00; - heap[17] = 0x00; - heap[18] = 0xEE; - heap[19] = 0x11; - heap[20] = 0x22; - heap[21] = 0x33; - heap[22] = 0x44; - heap[23] = 0x55; - heap[24] = 0x26; - heap[25] = 0xFF; - heap[26] = 0xEA; - heap[27] = 0xFA; - heap[28] = 0x14; - heap[29] = 0x43; - heap[30] = 0x21; - heap[31] = 0xAE; - - RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - - ldr q0, [x0] - ldr q1, [x0, #16] - umaxv h2, v0.4h - umaxv h3, v1.4h - - umaxv h4, v0.8h - umaxv h5, v1.8h - - umaxv s6, v0.4s - umaxv s7, v1.4s - - umaxv b8, v0.8b - umaxv b9, v1.8b - - umaxv b10, v0.16b - umaxv b11, v1.16b - - )"); - CHECK_NEON(2, uint16_t, - {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(3, uint16_t, - {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(4, uint16_t, - {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(5, uint16_t, - {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000}); - CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000}); - CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000}); - CHECK_NEON(8, uint8_t, - {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - CHECK_NEON(9, uint8_t, - {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - CHECK_NEON(10, uint8_t, - {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - CHECK_NEON(11, uint8_t, - {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00}) - - EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT); -} - TEST_P(InstSve, clastb) { // 64 bit RUN_AARCH64(R"( From 6119ade2f0df8f05439b43220c6b6c23f0e9cafe Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 18 Dec 2024 16:41:34 +0000 Subject: [PATCH 36/38] Responded to more comments --- .../simeng/arch/aarch64/helpers/neon.hh | 8 ++++-- .../simeng/arch/aarch64/helpers/sve.hh | 25 +++++++++---------- src/lib/arch/aarch64/Instruction_execute.cc | 8 +++--- 
src/lib/pipeline/ReorderBuffer.cc | 13 +++++++--- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index 98c1648d6b..e5cf3dd3aa 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -558,11 +558,13 @@ RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues, return {out, 256}; } -/** Helper function for NEON instructions with the format `uaddlv zd, zn.T`. +/** Helper function for NEON instructions with the format `uaddlv rd, Vn.T`. * T represents the type of the destination register (e.g. for h0, T = * uint32_t). * U represents the type of the sourceValues[0] (e.g. for v0.8b, U = * uint8_t) + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). * Returns correctly formatted RegisterValue. */ template RegisterValue vecAddlv(srcValContainer& sourceValues) { @@ -574,8 +576,10 @@ RegisterValue vecAddlv(srcValContainer& sourceValues) { return {out, 256}; } -/** Helper function for NEON instructions with the format `umaxv Vd, Vn.T`. +/** Helper function for NEON instructions with the format `umaxv rd, Vn.T`. * T represents the type of sourceValues (e.g. for vn.s, T = uint32_t). + * I represents the number of elements in the output array to be updated (e.g. + * for vd.8b I = 8). * Returns correctly formatted RegisterValue. */ template RegisterValue vecUMaxV(srcValContainer& sourceValues) { diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 8b23bb0ea9..563cc3ed62 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -877,8 +877,9 @@ RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues, /** Helper function for SVE instructions with the format `ftsmul zd, zn, zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. U represents the same precision as - * T, but as an integer type for the second source register. */ + * U represents the same precision as T, but as an integer type for the second + * source register. + * Returns correctly formatted RegisterValue. */ template RegisterValue sveFTrigSMul(srcValContainer& sourceValues, const uint16_t VL_bits) { @@ -903,8 +904,9 @@ RegisterValue sveFTrigSMul(srcValContainer& sourceValues, /** Helper function for SVE instructions with the format `ftssel zd, zn, zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). - * Returns correctly formatted RegisterValue. U represents the same precision as - * T, but as an integer type for the second source register. */ + * U represents the same precision as T, but as an integer type for the second + * source register. + * Returns correctly formatted RegisterValue. 
*/ template RegisterValue sveFTrigSSel(srcValContainer& sourceValues, const uint16_t VL_bits) { @@ -1096,7 +1098,6 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, const T* n = sourceValues[1].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - T out; // Get last active element int lastElem = 0; @@ -1109,20 +1110,18 @@ RegisterValue sveLastBScalar(srcValContainer& sourceValues, // If no active lane has been found, select highest element instead if (i == 0) lastElem = partition_num - 1; } - - out = n[lastElem]; - return {out, 256}; + return {n[lastElem], 256}; } -/** Helper function for SVE instructions with the format `clastb rd, pg, rd, +/** Helper function for SVE instructions with the format `clastb zd, pg, zd, * zn`. * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template -RegisterValue sveCLastBScalar(srcValContainer& sourceValues, - const uint16_t VL_bits) { +RegisterValue sveCLastBSimdScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t m = sourceValues[2].get(); + const T* m = sourceValues[2].getAsVector(); const T* n = sourceValues[3].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); @@ -1139,7 +1138,7 @@ RegisterValue sveCLastBScalar(srcValContainer& sourceValues, } if (lastElem < 0) { - out = m; + out = m[0]; } else { out = n[lastElem]; } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 2c87ee9ed3..6bbb6c0006 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2666,19 +2666,19 @@ void Instruction::execute() { break; } case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_S: { // clastb sd, pg, sn, zn.s - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_H: { // clastb hd, pg, hn, zn.h - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_B: { // clastb bd, pg, bn, zn.b - results_[0] = sveCLastBScalar(sourceValues_, VL_bits); + results_[0] = sveCLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc index 33326944a3..e53849ea89 100644 --- a/src/lib/pipeline/ReorderBuffer.cc +++ b/src/lib/pipeline/ReorderBuffer.cc @@ -87,10 +87,17 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) { // sufficiently long, assume an error in SimEng has occured. robHeadRepeatCounter_++; if (robHeadRepeatCounter_ > 10000000) { - std::cerr << "[SimEng:ReorderBuffer] Infinite loop detected in rob " - "commit at instruction address " + std::cerr << "[SimEng:ReorderBuffer] Instruction stuck unable to " + "commit at the head of ROB for a very long time at " + "instruction address 0x" << std::hex << uop->getInstructionAddress() << std::dec - << " (" << uop->getMicroOpIndex() << ")." << std::endl; + << " (MicroOp Index: " << uop->getMicroOpIndex() + << "). 
This is unexpected behaviour for most valid core " + "configurations, though may arise in designs with very " + "high latencies or bottlenecks. If this is not the case, " + "please try re-running. Please raise an issue on GitHub " + "if the problem persists." + << std::endl; exit(1); } break; From 6da7f5cf64fe06f780b5c131251a49ed3a9c594f Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Wed, 18 Dec 2024 16:50:01 +0000 Subject: [PATCH 37/38] Updated naming for confusing lastb helper --- src/include/simeng/arch/aarch64/helpers/sve.hh | 6 +++--- src/lib/arch/aarch64/Instruction_execute.cc | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 563cc3ed62..08afb5bb19 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1088,12 +1088,12 @@ RegisterValue sveIndex( return {out, 256}; } -/** Helper function for SVE instructions with the format `lastb rd, pg, zn`. +/** Helper function for SVE instructions with the format `lastb zd, pg, zn`. * T represents the vector register type (e.g. zd.d would be uint64_t). * Returns correctly formatted RegisterValue. */ template -RegisterValue sveLastBScalar(srcValContainer& sourceValues, - const uint16_t VL_bits) { +RegisterValue sveLastBSimdScalar(srcValContainer& sourceValues, + const uint16_t VL_bits) { const uint64_t* p = sourceValues[0].getAsVector(); const T* n = sourceValues[1].getAsVector(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 6bbb6c0006..3090e3cb42 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2650,19 +2650,19 @@ void Instruction::execute() { break; } case Opcode::AArch64_LASTB_VPZ_D: { // lastb dd, pg, zn.d - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LASTB_VPZ_S: { // lastb sd, pg, zn.s - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LASTB_VPZ_H: { // lastb hd, pg, zn.h - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_LASTB_VPZ_B: { // lastb bd, pg, zn.b - results_[0] = sveLastBScalar(sourceValues_, VL_bits); + results_[0] = sveLastBSimdScalar(sourceValues_, VL_bits); break; } case Opcode::AArch64_CLASTB_VPZ_D: { // clastb dd, pg, dn, zn.d From c9f708b46797db6e55734acc6c9478cb7a4f7c50 Mon Sep 17 00:00:00 2001 From: Joseph Moore Date: Thu, 19 Dec 2024 23:43:29 +0000 Subject: [PATCH 38/38] Fixed issues arising from merge conflicts on Capstone Update branch. 
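For readers unfamiliar with the `lastb`/`clastb` (SIMD & FP scalar) forms handled by the renamed helpers above, the selection logic can be sketched in standalone C++ as below. This is a minimal illustration only — the `lastb`/`clastb` functions and the one-bool-per-lane predicate are hypothetical simplifications, not SimEng code, which packs predicate lanes into 64-bit words inside `sveLastBSimdScalar`/`sveCLastBSimdScalar`.

#include <cstdint>
#include <iostream>
#include <vector>

// lastb: return the last element of `vec` whose predicate lane is active;
// if no lane is active, fall back to the highest-numbered element.
template <typename T>
T lastb(const std::vector<bool>& pred, const std::vector<T>& vec) {
  int last = -1;
  for (size_t i = 0; i < vec.size(); i++) {
    if (pred[i]) last = static_cast<int>(i);
  }
  return (last < 0) ? vec.back() : vec[last];
}

// clastb: as lastb, but if no lane is active the existing scalar `fallback`
// (the destination register's previous value) is kept instead.
template <typename T>
T clastb(const std::vector<bool>& pred, T fallback, const std::vector<T>& vec) {
  int last = -1;
  for (size_t i = 0; i < vec.size(); i++) {
    if (pred[i]) last = static_cast<int>(i);
  }
  return (last < 0) ? fallback : vec[last];
}

int main() {
  std::vector<uint64_t> z = {10, 20, 30, 40};
  std::vector<bool> pg = {true, true, false, false};     // lanes 0 and 1 active
  std::vector<bool> none = {false, false, false, false};  // no active lanes

  std::cout << lastb(pg, z) << "\n";                 // 20: last active lane
  std::cout << lastb(none, z) << "\n";               // 40: no active lane, highest element
  std::cout << clastb(none, uint64_t{99}, z) << "\n";  // 99: no active lane, keep old scalar
  return 0;
}

The only difference between the two is the fallback when no lane is active: `lastb` selects the highest-numbered vector element, while `clastb` keeps the destination register's previous scalar value.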
Updated comment for infinite loop detector --- .../simeng/arch/aarch64/helpers/sve.hh | 14 ++--- src/include/simeng/pipeline/ReorderBuffer.hh | 4 ++ src/lib/arch/aarch64/InstructionMetadata.cc | 55 ++++++++++++++++--- src/lib/arch/aarch64/Instruction_decode.cc | 7 ++- src/lib/pipeline/ReorderBuffer.cc | 11 ++-- test/regression/aarch64/instructions/sve.cc | 2 +- 6 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 08afb5bb19..6d4c0df66a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1120,9 +1120,9 @@ RegisterValue sveLastBSimdScalar(srcValContainer& sourceValues, template RegisterValue sveCLastBSimdScalar(srcValContainer& sourceValues, const uint16_t VL_bits) { - const uint64_t* p = sourceValues[1].getAsVector(); - const T* m = sourceValues[2].getAsVector(); - const T* n = sourceValues[3].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const T* m = sourceValues[1].getAsVector(); + const T* n = sourceValues[2].getAsVector(); const uint16_t partition_num = VL_bits / (sizeof(T) * 8); T out; @@ -1510,8 +1510,8 @@ std::array svePsel( std::tuple, uint8_t> svePfirst( srcValContainer& sourceValues, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / 8; - const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t* dn = sourceValues[2].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); // Set destination d as source n to copy all false lanes and the active lanes // beyond the first std::array out = {dn[0], dn[1], dn[2], dn[3]}; @@ -1535,8 +1535,8 @@ std::tuple, uint8_t> svePnext( const simeng::arch::aarch64::InstructionMetadata& metadata, const uint16_t VL_bits) { const uint16_t partition_num = VL_bits / (sizeof(T) * 8); - const uint64_t* p = sourceValues[1].getAsVector(); - const uint64_t* dn = sourceValues[2].getAsVector(); + const uint64_t* p = sourceValues[0].getAsVector(); + const uint64_t* dn = sourceValues[1].getAsVector(); // Set destination elements to 0 std::array out = {0, 0, 0, 0}; diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh index c7ee01fcc6..4c31eeb38a 100644 --- a/src/include/simeng/pipeline/ReorderBuffer.hh +++ b/src/include/simeng/pipeline/ReorderBuffer.hh @@ -129,6 +129,10 @@ class ReorderBuffer { * the ROB */ uint64_t robHeadRepeatCounter_ = 0; + /** A limit for the counter of how long an instruction can be stuck at the + * head of the ROB before SimEng exits with an exception. */ + uint64_t robHeadRepeatLimit_ = 10000000; + /** The sequence ID of the youngest instruction that should remain after the * current flush. 
*/ uint64_t flushAfter_; diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 9653e3a00a..219023d93a 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -89,14 +89,6 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) } case Opcode::AArch64_SMAX_ZI_B: [[fallthrough]]; - case Opcode::AArch64_FTSMUL_ZZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FTSMUL_ZZZ_S: - [[fallthrough]]; - case Opcode::AArch64_FTSSEL_ZZZ_D: - [[fallthrough]]; - case Opcode::AArch64_FTSSEL_ZZZ_S: - [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_D: [[fallthrough]]; case Opcode::AArch64_SMAX_ZI_H: @@ -108,6 +100,14 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[0].access = CS_AC_WRITE; operands[1].access = CS_AC_READ; break; + case Opcode::AArch64_FTSMUL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSMUL_ZZZ_S: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_D: + [[fallthrough]]; + case Opcode::AArch64_FTSSEL_ZZZ_S: + [[fallthrough]]; case Opcode::AArch64_FSUB_ZPmI_D: [[fallthrough]]; case Opcode::AArch64_FSUB_ZPmI_H: @@ -131,6 +131,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[2].access = CS_AC_READ; break; } + case Opcode::AArch64_FTMAD_ZZI_D: + [[fallthrough]]; + case Opcode::AArch64_FTMAD_ZZI_S: { + // Incorrect access types + operands[0].access = CS_AC_READ | CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + break; + } + case Opcode::AArch64_PFIRST_B: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_D: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_S: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_H: + [[fallthrough]]; + case Opcode::AArch64_PNEXT_B: { + // Incorrect access types + operands[0].access = CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + // Doesn't identify implicit NZCV destination + implicitDestinationCount = 1; + implicitDestinations[0] = AARCH64_REG_NZCV; + break; + } + case Opcode::AArch64_CLASTB_VPZ_D: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_S: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_H: + [[fallthrough]]; + case Opcode::AArch64_CLASTB_VPZ_B: + [[fallthrough]]; case Opcode::AArch64_AND_ZPmZ_D: // Example bytecode - 4901da04 [[fallthrough]]; case Opcode::AArch64_AND_ZPmZ_H: @@ -163,6 +198,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) [[fallthrough]]; case Opcode::AArch64_SMAX_ZPmZ_S: // Example bytecode - 01008804 [[fallthrough]]; + case Opcode::AArch64_SPLICE_ZPZ_D: + [[fallthrough]]; + case Opcode::AArch64_SPLICE_ZPZ_S: + [[fallthrough]]; case Opcode::AArch64_MUL_ZPmZ_B: // Example bytecode - 40001004 [[fallthrough]]; case Opcode::AArch64_MUL_ZPmZ_D: diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index de68245ff6..215ade08fa 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -499,9 +499,10 @@ void Instruction::decode() { // Check first operand access to determine if it's a load or store if (metadata_.operands[0].access & CS_AC_WRITE) { - if (metadata_.id == ARM64_INS_STXR || metadata_.id == ARM64_INS_STLXR || - metadata_.id == ARM64_INS_STLXRB || - metadata_.id == ARM64_INS_STLXRH) { + if (metadata_.id == AARCH64_INS_STXR || + metadata_.id == AARCH64_INS_STLXR || + metadata_.id == AARCH64_INS_STLXRB || + metadata_.id == AARCH64_INS_STLXRH) { // Exceptions to this is load 
condition are exclusive store with a
        // success flag as first operand
        if (microOpcode_ != MicroOpcode::STR_DATA) {
diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc
index e53849ea89..20a2970995 100644
--- a/src/lib/pipeline/ReorderBuffer.cc
+++ b/src/lib/pipeline/ReorderBuffer.cc
@@ -86,17 +86,20 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) {
       // If an instruction has been stuck at the head of the rob for
       // sufficiently long, assume an error in SimEng has occured.
       robHeadRepeatCounter_++;
-      if (robHeadRepeatCounter_ > 10000000) {
+      if (robHeadRepeatCounter_ > robHeadRepeatLimit_) {
         std::cerr << "[SimEng:ReorderBuffer] Instruction stuck unable to "
-                     "commit at the head of ROB for a very long time at "
+                     "commit at the head of the ROB for 10,000,000 cycles at "
                      "instruction address 0x"
                   << std::hex << uop->getInstructionAddress() << std::dec
                   << " (MicroOp Index: " << uop->getMicroOpIndex()
                   << "). This is unexpected behaviour for most valid core "
                      "configurations, though may arise in designs with very "
                      "high latencies or bottlenecks. If this is not the case, "
-                     "please try re-running. Please raise an issue on GitHub "
-                     "if the problem persists."
+                     "please try re-running. If such long stalls are expected, "
+                     "you can increase this limit in "
+                     "`src/include/simeng/pipeline/ReorderBuffer.hh` via the "
+                     "variable `robHeadRepeatLimit_`. Please raise an issue on "
+                     "GitHub if the problem persists."
                   << std::endl;
         exit(1);
       }
       break;
diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc
index 6a75c597cc..f94ee28262 100644
--- a/test/regression/aarch64/instructions/sve.cc
+++ b/test/regression/aarch64/instructions/sve.cc
@@ -5017,7 +5017,7 @@ TEST_P(InstSve, ftsmul) {
                            144.0f, 12.25f},
                           VL / 16));
 
-  EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_MUL);
+  EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_SIMPLE_ARTH_NOSHIFT);
 }
 
 TEST_P(InstSve, ftssel) {
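The ROB watchdog above boils down to a small counter-and-limit pattern; a distilled, self-contained sketch is given below. The `HeadWatchdog` class and its `tick` interface are illustrative assumptions, not SimEng's `ReorderBuffer` API, which tracks the stalled head micro-op via `robHeadRepeatCounter_` and `robHeadRepeatLimit_` as shown in the diff.

#include <cstdint>
#include <iostream>

// Minimal sketch of a "stuck at head" watchdog: a counter increments every
// cycle the same entry sits uncommitted at the head of a queue, resets
// whenever the head makes progress, and trips once a configurable limit is
// exceeded. All names here are illustrative only.
class HeadWatchdog {
 public:
  explicit HeadWatchdog(uint64_t limit) : limit_(limit) {}

  // Call once per cycle with the ID currently at the head of the queue and
  // whether it committed this cycle. Returns true if the watchdog tripped.
  bool tick(uint64_t headId, bool committed) {
    if (committed || headId != lastHeadId_) {
      lastHeadId_ = headId;
      counter_ = 0;
      return false;
    }
    return ++counter_ > limit_;
  }

 private:
  uint64_t limit_;
  uint64_t lastHeadId_ = UINT64_MAX;
  uint64_t counter_ = 0;
};

int main() {
  HeadWatchdog wd(3);  // trips once the same head has repeated more than 3 times
  for (int cycle = 0; cycle < 6; cycle++) {
    bool tripped = wd.tick(/*headId=*/42, /*committed=*/false);
    // Prints "ok" for cycles 0-3 (first sighting plus 3 repeats), then "stuck!"
    std::cout << "cycle " << cycle << (tripped ? ": stuck!\n" : ": ok\n");
  }
  return 0;
}

Resetting the counter whenever the head changes or commits confines the check to genuinely stalled heads, which is why the limit can safely default to a large value such as 10,000,000 cycles.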