From 51ade58d82460d966dc901cf30fa0b22ad4cf5ea Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 7 Oct 2024 17:10:33 +0100 Subject: [PATCH 01/71] Fixed execution logic for UMINP and UMAXP neon instructions. --- .../simeng/arch/aarch64/helpers/neon.hh | 14 ++++++++++++-- src/lib/arch/aarch64/ExceptionHandler.cc | 5 ++--- src/lib/arch/aarch64/InstructionMetadata.cc | 2 +- .../aarch64/instructions/bitmanip.cc | 18 ++++++++++++++++++ test/regression/aarch64/instructions/neon.cc | 8 ++++---- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..17137dcb55 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, m, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), n, sizeof(T) * I); + // Compare each adjacent pair of elements T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::max(n[i], m[i]); + out[i] = std::max(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } @@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, m, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), n, sizeof(T) * I); + T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::min(n[i], m[i]); + out[i] = std::min(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc index ae98dddb1a..ff7375339f 100644 --- a/src/lib/arch/aarch64/ExceptionHandler.cc +++ b/src/lib/arch/aarch64/ExceptionHandler.cc @@ -626,8 +626,7 @@ bool ExceptionHandler::init() { break; } - case 293: // rseq - { + case 293: { // rseq stateChange = {ChangeType::REPLACEMENT, {R0}, {0ull}}; break; } @@ -818,7 +817,7 @@ void ExceptionHandler::readLinkAt(span path) { for (size_t i = 0; i < bytesCopied; i += 256) { uint8_t size = std::min(bytesCopied - i, 256ul); stateChange.memoryAddresses.push_back({bufAddress + i, size}); - stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr, size)); + stateChange.memoryAddressValues.push_back(RegisterValue(bufPtr + i, size)); } concludeSyscall(stateChange); diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..ce71ec5b1f 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -244,7 +244,7 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) if (isAlias) { exceptionString_ = "This instruction is an alias. 
The printed mnemonic and operand string " - "differ from what is expected of the Capstone opcode."; + "may differ from the underlying opcode."; } } diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc index a72dcb64dc..30eb27fcef 100644 --- a/test/regression/aarch64/instructions/bitmanip.cc +++ b/test/regression/aarch64/instructions/bitmanip.cc @@ -274,11 +274,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm w2, w0, #16, #31 ubfm w3, w0, #28, #23 ubfm w4, w0, #30, #27 + + # check alias + mov w10, #-1 + mov w11, #-1 + mov w12, #128 + lsl w10, w12, #1 + lsr w11, w12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x0000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x07A00000ull); EXPECT_EQ(getGeneralRegister(4), 0x01E80000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); RUN_AARCH64(R"( # Fill destination registers with 1s @@ -295,11 +304,20 @@ TEST_P(InstBitmanip, ubfm) { ubfm x2, x0, #16, #63 ubfm x3, x0, #32, #23 ubfm x4, x0, #60, #55 + + # check alias + mov x10, #-1 + mov x11, #-1 + mov x12, #128 + lsl x10, x12, #1 + lsr x11, x12, #1 )"); EXPECT_EQ(getGeneralRegister(1), 0x00000000000007A0ull); EXPECT_EQ(getGeneralRegister(2), 0x000000000000007Aull); EXPECT_EQ(getGeneralRegister(3), 0x007A000000000000ull); EXPECT_EQ(getGeneralRegister(4), 0x0000000007A00000ull); + EXPECT_EQ(getGeneralRegister(10), 256); + EXPECT_EQ(getGeneralRegister(11), 64); } INSTANTIATE_TEST_SUITE_P(AArch64, InstBitmanip, diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..ad11b13e9a 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -2684,8 +2684,8 @@ TEST_P(InstNeon, uminp) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0x00, 0xEE, 0x11, 0x22, 0x33, 0x44, 0x55, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08}); + {0x00, 0x11, 0x22, 0x44, 0xEE, 0xCC, 0xAA, 0x88, 0x00, 0xAA, 0xBB, + 0xDD, 0x01, 0x03, 0x05, 0x07}); } TEST_P(InstNeon, umaxp) { // umaxp vd.16b vn.16b vm.16b @@ -2742,8 +2742,8 @@ TEST_P(InstNeon, umaxp) { )"); CHECK_NEON(2, uint8_t, - {0x01, 0x00, 0xFF, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0xEE, 0xDD, - 0xCC, 0xBB, 0xAA, 0x99, 0x88}); + {0x00, 0xEE, 0x33, 0x55, 0xFF, 0xDD, 0xBB, 0x99, 0x01, 0xFF, 0xCC, + 0xEE, 0x02, 0x04, 0x06, 0x08}); } TEST_P(InstNeon, smax) { From 6a11d7d8b46d02d24e634a85368e69dc0d10d576 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 7 Oct 2024 17:36:38 +0100 Subject: [PATCH 02/71] Implemented ldrsb (32-bit, Post) instruction with test. 
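
LDRSB (post-index) loads a single byte, sign-extends it into the 32-bit
destination register, and only then adds the immediate to the base
register. A minimal sketch of the intended semantics, assuming the byte
has already been read from the address in Xn (names are illustrative,
not the simulator's API):

    #include <cstdint>

    // wt receives the sign-extended byte; xn is updated after the access.
    void ldrsbPost(uint32_t& wt, uint64_t& xn, int8_t loadedByte, int64_t imm) {
      wt = static_cast<uint32_t>(static_cast<int32_t>(loadedByte));
      xn += imm;
    }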
--- src/lib/arch/aarch64/Instruction_address.cc | 4 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 8 ++++++++ test/regression/aarch64/instructions/load.cc | 17 +++++++++++++---- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..a8a98e5edd 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -679,6 +679,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + setMemoryAddresses({{sourceValues_[0].get(), 1}}); + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] uint64_t offset = extendOffset(sourceValues_[1].get(), diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..63f8147aa3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3522,6 +3522,14 @@ void Instruction::execute() { results_[0] = memoryData_[0].zeroExtend(16, 256); break; } + case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm + // LOAD + results_[1] = RegisterValue( + static_cast(memoryData_[0].get()), 4); + results_[0] = + sourceValues_[0].get() + metadata_.operands[2].imm; + break; + } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend // {#amount}}] // LOAD diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 09269eebb8..2718c1fdb3 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -1222,14 +1222,23 @@ TEST_P(InstLoad, ldrsb) { mov x5, 1 # Load 8-bit values from heap and sign-extend to 32-bits ldrsb w1, [x0, x5, sxtx] + # Post Index + mov x20, x0 + ldrsb w2, [x20], #16 + # Load 8-bit values from heap and sign-extend to 64-bits - ldrsb x2, [x0] - ldrsb x3, [x0, #3] + ldrsb x3, [x0] + ldrsb x4, [x0, #3] + )"); EXPECT_EQ(getGeneralRegister(1), INT8_MAX); - EXPECT_EQ(getGeneralRegister(2), -2); - EXPECT_EQ(getGeneralRegister(3), 64); + EXPECT_EQ(getGeneralRegister(2), -2); + EXPECT_EQ(getGeneralRegister(20), + getGeneralRegister(0) + 16); + + EXPECT_EQ(getGeneralRegister(3), -2); + EXPECT_EQ(getGeneralRegister(4), 64); } TEST_P(InstLoad, ldrsh) { From 520324c4049851675fd71cef41ed7aa968070aab Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 8 Oct 2024 15:31:06 +0100 Subject: [PATCH 03/71] Fixed implementation of NEON CMHS instruction. 
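
CMHS is an unsigned "higher or same" comparison, so the elements must be
compared as uint8_t. The previous int8_t comparison mis-handled any lane
with the top bit set. A small, hedged illustration of the difference
(values chosen for clarity):

    #include <cstdint>

    // 0x80 is 128 as uint8_t but -128 as int8_t.
    bool asUnsigned = static_cast<uint8_t>(0x80) >= static_cast<uint8_t>(0x01);  // true
    bool asSigned = static_cast<int8_t>(0x80) >= static_cast<int8_t>(0x01);      // false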
--- src/lib/arch/aarch64/Instruction_execute.cc | 4 ++-- test/regression/aarch64/instructions/neon.cc | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 63f8147aa3..93c1bfeca2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -700,9 +700,9 @@ void Instruction::execute() { break; } case Opcode::AArch64_CMHSv16i8: { // cmhs vd.16b, vn.16b, vm.16b - results_[0] = vecCompare( + results_[0] = vecCompare( sourceValues_, false, - [](int8_t x, int8_t y) -> bool { return (x >= y); }); + [](uint8_t x, uint8_t y) -> bool { return (x >= y); }); break; } case Opcode::AArch64_CMPEQ_PPzZI_B: { // cmpeq pd.b, pg/z, zn.b, #imm diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index ad11b13e9a..2a28a4e22b 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -727,8 +727,8 @@ TEST_P(InstNeon, cmhs) { heap[1] = 0x7F; heap[2] = INT8_MAX; heap[3] = 1; - heap[4] = -128; - heap[5] = -1; + heap[4] = 128; + heap[5] = 1; heap[6] = 0xAA; heap[7] = 0xBB; heap[8] = 0xCC; @@ -744,7 +744,7 @@ TEST_P(InstNeon, cmhs) { heap[16] = INT8_MAX; heap[17] = 0x7F; heap[18] = 0; - heap[19] = -128; + heap[19] = 128; heap[20] = 1; heap[21] = 0; heap[22] = 0xAA; @@ -772,10 +772,10 @@ TEST_P(InstNeon, cmhs) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + {0x00, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, - {0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + {0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); } From 2b4a88605cead6f45aeeb1c149d5ce6be579c423 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 9 Oct 2024 13:35:38 +0100 Subject: [PATCH 04/71] Implemented UCVTF (fixed-point to float) instruction with test. 
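
UCVTF with an #fbits operand interprets the source register as an
unsigned fixed-point number with fbits fractional bits, so the result is
the integer value divided by 2^fbits. A minimal sketch of that
conversion (illustrative helper name, not the simulator's API):

    #include <cstdint>

    float ucvtfFixedToFloat(uint64_t xn, unsigned fbits) {
      return static_cast<float>(xn) / static_cast<float>(1ull << fbits);
    }

    // e.g. ucvtfFixedToFloat(0x1EE, 2) == 123.5f, matching the new test.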
--- src/lib/arch/aarch64/Instruction_execute.cc | 11 +++++++ test/regression/aarch64/instructions/float.cc | 31 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 93c1bfeca2..4f4e17fa18 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5526,6 +5526,17 @@ void Instruction::execute() { bfm_2imms(sourceValues_, metadata_, false, true); break; } + case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits + // Convert Fixed-Point to FP32 + // Using algorithm from + // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ + const uint64_t xn = sourceValues_[0].get(); + const uint64_t fbits = static_cast(metadata_.operands[2].imm); + std::cerr << xn << " " << fbits << std::endl; + results_[0] = { + static_cast(xn) / static_cast(1ull << fbits), 256}; + break; + } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn results_[0] = {static_cast(sourceValues_[0].get()), 256}; diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index 03f3f799df..bc2d09ea27 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -1453,6 +1453,37 @@ TEST_P(InstFloat, ucvtf) { CHECK_NEON(9, float, {static_cast(UINT64_C(1) << 48), 0.f, 0.f, 0.f}); CHECK_NEON(10, float, {static_cast(UINT64_MAX), 0.f, 0.f, 0.f}); CHECK_NEON(11, float, {0.f, 0.f, 0.f, 0.f}); + + // 32-bit unsigned fixed-point to float + // Numbers have been chosen to have less than 0.0005 fixed-point + // representation error to ensure tests pass + initialHeapData_.resize(12); + heap32 = reinterpret_cast(initialHeapData_.data()); + heap32[0] = 0x000001EE; + heap32[1] = 0x00021F3B; + heap32[2] = 0x32FE6B75; + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # 2 fraction-bits (123.5) + ldr w1, [x0], #4 + ucvtf s1, x1, #0x2 + + # 8 fraction-bits (543.23) + ldr w2, [x0], #4 + ucvtf s2, x2, #0x8 + + + # 23 fraction-bits (101.987654321) + ldr w3, [x0] + ucvtf s3, x3, #0x17 + )"); + CHECK_NEON(1, float, {123.5f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(2, float, {543.23f, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(3, float, {101.987654321f, 0.0f, 0.0f, 0.0f}); } TEST_P(InstFloat, frintp) { From e43ada7b0e615744521824735929929b0e53cd04 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 9 Oct 2024 14:58:28 +0100 Subject: [PATCH 05/71] Implemented UCVTF (fixed-point to float) helper function. --- src/include/simeng/arch/aarch64/helpers/float.hh | 15 +++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 9 +-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 454f50070c..8675f5ed0c 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -194,6 +194,21 @@ D fcvtzu_integer(srcValContainer& sourceValues) { return result; } +/** Helper function for SCALAR/FP instructions with the format ucvtf rd, rn + * #fbits. + * D represents the destination register type (e.g. for Sd, D = float). + * N represents the source register type (e.g. for Xn, N = uint32_t). + * Returns single value of type D. 
*/ +template +D ucvtf_fixedToFloat(srcValContainer& sourceValues) { + // Convert Fixed-Point to FP + // Using algorithm from + // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ + const N xn = sourceValues_[0].get(); + const N fbits = static_cast(metadata_.operands[2].imm); + return (static_cast(xn) / static_cast(1ull << fbits)); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4f4e17fa18..97d98cdb8e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5527,14 +5527,7 @@ void Instruction::execute() { break; } case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits - // Convert Fixed-Point to FP32 - // Using algorithm from - // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ - const uint64_t xn = sourceValues_[0].get(); - const uint64_t fbits = static_cast(metadata_.operands[2].imm); - std::cerr << xn << " " << fbits << std::endl; - results_[0] = { - static_cast(xn) / static_cast(1ull << fbits), 256}; + results_[0] = {ucvtf_fixedToFloat(sourceValues_), 256}; break; } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn From 4773af8fdd5f842718c2f0334025c8a2a7ee9776 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 15:13:15 +0100 Subject: [PATCH 06/71] Implemented UDOT (by element) NEON instructions with tests. --- .../simeng/arch/aarch64/helpers/float.hh | 8 +++-- .../simeng/arch/aarch64/helpers/neon.hh | 30 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 11 +++++- .../aarch64/AArch64RegressionTest.hh | 15 ++++---- test/regression/aarch64/instructions/neon.cc | 35 +++++++++++++++++++ 5 files changed, 89 insertions(+), 10 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/float.hh b/src/include/simeng/arch/aarch64/helpers/float.hh index 8675f5ed0c..0d198f9268 100644 --- a/src/include/simeng/arch/aarch64/helpers/float.hh +++ b/src/include/simeng/arch/aarch64/helpers/float.hh @@ -200,12 +200,14 @@ D fcvtzu_integer(srcValContainer& sourceValues) { * N represents the source register type (e.g. for Xn, N = uint32_t). * Returns single value of type D. */ template -D ucvtf_fixedToFloat(srcValContainer& sourceValues) { +D ucvtf_fixedToFloat( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { // Convert Fixed-Point to FP // Using algorithm from // https://embeddedartistry.com/blog/2018/07/12/simple-fixed-point-conversion-in-c/ - const N xn = sourceValues_[0].get(); - const N fbits = static_cast(metadata_.operands[2].imm); + const N xn = sourceValues[0].get(); + const N fbits = static_cast(metadata.operands[2].imm); return (static_cast(xn) / static_cast(1ull << fbits)); } diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index 17137dcb55..c2bf42e6fa 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,36 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.4b[index]`. + * D represents the number of elements in the output vector to be updated (i.e. + * for vd.2s D = 2). Only 2 or 4 are valid. + * Returns correctly formatted RegisterValue. 
*/ +template +RegisterValue vecUdot_byElement( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + uint32_t acc = vd[i]; + for (int j = 0; j < 4; j++) { + acc += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * index) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `zip<1,2> vd.T, * vn.T, vm.T`. * T represents the type of sourceValues (e.g. for vn.d, T = uint64_t). diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 97d98cdb8e..19136d5442 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5527,7 +5527,8 @@ void Instruction::execute() { break; } case Opcode::AArch64_UCVTFSXSri: { // ucvtf sd, xn, #fbits - results_[0] = {ucvtf_fixedToFloat(sourceValues_), 256}; + results_[0] = { + ucvtf_fixedToFloat(sourceValues_, metadata_), 256}; break; } case Opcode::AArch64_UCVTFUWDri: { // ucvtf dd, wn @@ -5568,6 +5569,14 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] + results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); + break; + } + case Opcode::AArch64_UDOTlanev8i8: { // udot vd.2s, vn.8b, vm.4b[index] + results_[0] = vecUdot_byElement<2>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UMADDLrrr: { // umaddl xd, wn, wm, xa results_[0] = maddl_4ops(sourceValues_); break; diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 32d975b09d..3e39fa59fe 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -239,13 +239,16 @@ class AArch64RegressionTest : public RegressionTest { /** Get the subtarget feature string based on LLVM version being used */ std::string getSubtargetFeaturesString() { -#if SIMENG_LLVM_VERSION < 14 - return "+sve,+lse"; -#elif SIMENG_LLVM_VERSION < 18 - return "+sve,+lse,+sve2,+sme,+sme-f64"; -#else - return "+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + std::string features = "+dotprod,+sve,+lse"; +#if SIMENG_LLVM_VERSION > 13 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64"; + features += ",+sve2,+sme,+sme-f64"; #endif +#if SIMENG_LLVM_VERSION > 17 + // "+dotprod,+sve,+lse,+sve2,+sme,+sme-f64f64,+sme-i16i64,+sme2"; + features += "f64,+sme-i16i64,+sme2"; +#endif + return features; } /** Check the elements of a Neon register. 
diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 2a28a4e22b..92d270288f 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3648,6 +3648,41 @@ TEST_P(InstNeon, trn) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, udot) { + // udot by element + initialHeapData_.resize(128); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA9876543210; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + movi v3.4s, #4 + movi v4.4s, #5 + movi v5.4s, #6 + + udot v2.2s, v1.8b, v0.4b[0] + udot v3.4s, v1.16b, v0.4b[1] + udot v4.2s, v1.8b, v0.4b[2] + udot v5.4s, v1.16b, v0.4b[3] + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFF00FF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA9876543210, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0xd929, 0x26f91, 0x0, 0x0}); + CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); + CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); + CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); +} + TEST_P(InstNeon, uzp) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 50a8a20efbf331efc4eb7fff6ef20bc724e96330 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 16:17:01 +0100 Subject: [PATCH 07/71] Implemented LD1 (NEON 8h x2, post index) instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 3 ++ src/lib/arch/aarch64/Instruction_execute.cc | 3 ++ test/regression/aarch64/instructions/load.cc | 47 ++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index a8a98e5edd..df94a5efda 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -275,6 +275,9 @@ span Instruction::generateAddresses() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s: // ld1 {vt1.4s, vt2.4s}, [xn] [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 19136d5442..d7d12040c1 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3106,6 +3106,9 @@ void Instruction::execute() { case Opcode::AArch64_LD1Twov2d_POST: // ld1 {vt1.2d, vt2.2d}, [xn], // <#imm|xm> [[fallthrough]]; + case Opcode::AArch64_LD1Twov8h_POST: // ld1 {vt1.8h, vt2.8h}, [xn], + // <#imm|xm> + [[fallthrough]]; case Opcode::AArch64_LD1Twov4s_POST: { // ld1 {vt1.4s, vt2.4s}, [xn], // <#imm|xm> // LOAD diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index 2718c1fdb3..b59f1f8cf5 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -282,6 +282,53 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 32); + // Two reg, 8h elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + 
mov x8, 214 + svc #0 + + mov x1, #32 + + # Load values from heap + # ld1 {v0.8h, v1.8h}, [x0] + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v2.8h, v3.8h}, [x0], #32 + + # save heap address after post index + mov x11, x0 + sub x0, x0, #32 + + # Load values from heap with reg post-index + ld1 {v4.8h, v5.8h}, [x0], x1 + + mov x12, x0 + )"); + + // CHECK_NEON(0, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + // CHECK_NEON(1, uint16_t, + // {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, + // 0xEEDD}); + CHECK_NEON(2, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(3, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(4, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + CHECK_NEON(5, uint16_t, + {0x00FF, 0x2211, 0x4433, 0x6655, 0x8877, 0xAA99, 0xCCBB, 0xEEDD}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 32); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 32); + // Two reg, 2d elements RUN_AARCH64(R"( # Get heap address From 6696d5f861357f42e0e2876a540aa2795b507c24 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 17:34:51 +0100 Subject: [PATCH 08/71] Implemented NEON UMLAL (32 to 64 bit) instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 13 ++++++++++ test/regression/aarch64/instructions/neon.cc | 26 ++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d7d12040c1..caa423871e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5592,6 +5592,19 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMLALv2i32_indexed: { // umlal vd.2d, vn.2s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[0]) * vm_idx_elem, + vd[1] + static_cast(vn[1]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 92d270288f..8ecee526e6 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3758,6 +3758,32 @@ TEST_P(InstNeon, uzp) { CHECK_NEON(8, uint64_t, {0x1e1c1a1816141210, 0x1f1d1b1917151311}); } +TEST_P(InstNeon, umlal) { + // uint32 to uint64, lower half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[0], w2 + mov v1.s[1], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal v2.2d, v1.2s, v0.s[0] + umlal v3.2d, v1.2s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); +} + TEST_P(InstNeon, zip) { initialHeapData_.resize(128); 
uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From bb5096a6ca1c2acfecf3c55c2021abf7aeefe182 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 10 Oct 2024 17:41:06 +0100 Subject: [PATCH 09/71] Implemented NEON UMLAL2 (32 to 64 bit) instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 13 +++++++++++ test/regression/aarch64/instructions/neon.cc | 24 ++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index caa423871e..aa58b2fe9f 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5605,6 +5605,19 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_UMLALv4i32_indexed: { // umlal2 vd.2d, vn.4s, + // vm.s[index] + const uint64_t* vd = sourceValues_[0].getAsVector(); + const uint32_t* vn = sourceValues_[1].getAsVector(); + const uint32_t* vm = sourceValues_[2].getAsVector(); + const int64_t index = metadata_.operands[2].vector_index; + const uint64_t vm_idx_elem = static_cast(vm[index]); + + uint64_t out[2] = {vd[0] + static_cast(vn[2]) * vm_idx_elem, + vd[1] + static_cast(vn[3]) * vm_idx_elem}; + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 8ecee526e6..e8ce4f13f2 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3782,6 +3782,30 @@ TEST_P(InstNeon, umlal) { CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); + + // uint32 to uint64, upper half + RUN_AARCH64(R"( + mov w0, #-1 + mov w1, #344 + mov v0.s[0], w0 + mov v0.s[3], w1 + + mov w2, #-1 + mov w3, #3 + mov v1.s[2], w2 + mov v1.s[3], w3 + + mov v2.d[0], xzr + mov v2.d[1], xzr + mov v3.d[0], xzr + mov v3.d[1], xzr + + umlal2 v2.2d, v1.4s, v0.s[0] + umlal2 v3.2d, v1.4s, v0.s[3] + )"); + CHECK_NEON(0, uint32_t, {UINT32_MAX, 0, 0, 344}); + CHECK_NEON(2, uint64_t, {18446744065119617025ull, 12884901885ull}); + CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); } TEST_P(InstNeon, zip) { From 09d65069af365105c5d907cb807d95cfc1707fd4 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 11 Oct 2024 13:00:39 +0100 Subject: [PATCH 10/71] Implemented NEON ST1 (single vector, post index) instruction with tests. 
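
ST1 {Vt.4S}, [Xn|SP], <#imm|Xm> stores the full 128-bit register to
memory and then post-increments the base: the immediate form can only
encode 16, while the register form adds Xm. A minimal sketch of the base
update, assuming the 16 data bytes are written separately (illustrative
names only):

    #include <cstdint>

    uint64_t st1PostBase(uint64_t base, bool usesRegOffset, uint64_t xm) {
      return base + (usesRegOffset ? xm : 16);  // #imm form is fixed at 16
    }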
--- src/lib/arch/aarch64/Instruction_address.cc | 5 +++++ src/lib/arch/aarch64/Instruction_execute.cc | 13 ++++++++++++ test/regression/aarch64/instructions/store.cc | 20 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index df94a5efda..2594e07ed6 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1284,6 +1284,11 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + const uint64_t base = sourceValues_[1].get(); + setMemoryAddresses({base, 16}); + break; + } case Opcode::AArch64_ST1Twov16b: // st1 {vt.16b, vt2.16b}, [xn] [[fallthrough]]; case Opcode::AArch64_ST1Twov16b_POST: // st1 {vt.16b, vt2.16b}, [xn], diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index aa58b2fe9f..13bb362c35 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4724,6 +4724,19 @@ void Instruction::execute() { results_[0] = sourceValues_[4].get() + postIndex; break; } + case Opcode::AArch64_ST1Onev4s_POST: { // st1 {vt.4s}, [xn|sp], <#imm|xm> + // STORE + const uint32_t* vt = sourceValues_[0].getAsVector(); + memoryData_[0] = RegisterValue((char*)vt, 4 * sizeof(uint32_t)); + + // if #imm post-index, value can only be 16 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[2].get() + : 16; + results_[0] = sourceValues_[1].get() + postIndex; + break; + } case Opcode::AArch64_ST1Twov16b: { // st1 {vt.16b, vt2.16b}, [xn|sp] // STORE const uint8_t* t = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc index 6d6876b494..6a8136da37 100644 --- a/test/regression/aarch64/instructions/store.cc +++ b/test/regression/aarch64/instructions/store.cc @@ -437,6 +437,26 @@ TEST_P(InstStore, st1_multi_struct) { } } + // one reg, 4s elements (post offset only) + RUN_AARCH64(R"( + mov x0, #32 + movi v0.4s, #1 + sub sp, sp, #96 + st1 {v0.4s}, [sp], #16 + st1 {v0.4s}, [sp], x0 + )"); + const uint64_t sp = process_->getInitialStackPointer(); + EXPECT_EQ(getGeneralRegister(31), sp - 48); + EXPECT_EQ(getMemoryValue(sp - 96), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 92), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 88), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 84), static_cast(1)); + + EXPECT_EQ(getMemoryValue(sp - 80), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 76), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 72), static_cast(1)); + EXPECT_EQ(getMemoryValue(sp - 68), static_cast(1)); + // two reg, 4s elements RUN_AARCH64(R"( mov x0, #32 From f6e7c03afb691fd60dbcc32797ad6c04f8823c43 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 11 Oct 2024 14:39:42 +0100 Subject: [PATCH 11/71] Implemented NEON LD1 (single vector, post index, 8b) instruction with tests. 
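
LD1 {Vt.8B}, [Xn], <#imm|Xm> reads 8 bytes into the low half of Vt (the
upper half is zeroed) and then advances the base by 8 for the immediate
form or by Xm for the register form. A minimal sketch of the behaviour,
assuming the 8 bytes have already been read (illustrative names only):

    #include <cstdint>
    #include <cstring>

    void ld1Post8b(uint8_t vt[16], uint64_t& xn, const uint8_t data[8],
                   bool usesRegOffset, uint64_t xm) {
      std::memset(vt, 0, 16);        // zero the destination, including the upper half
      std::memcpy(vt, data, 8);      // fill the low 64 bits
      xn += usesRegOffset ? xm : 8;  // post-index update
    }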
--- src/lib/arch/aarch64/Instruction_address.cc | 4 +++ src/lib/arch/aarch64/Instruction_execute.cc | 10 ++++++ test/regression/aarch64/instructions/load.cc | 35 ++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 2594e07ed6..0758669e65 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -243,6 +243,10 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 16}}); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + setMemoryAddresses({{sourceValues_[0].get(), 8}}); + break; + } case Opcode::AArch64_LD1Fourv16b: // ld1 {vt1.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn] [[fallthrough]]; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 13bb362c35..b44ab3bdc0 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2795,6 +2795,16 @@ void Instruction::execute() { results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; } + case Opcode::AArch64_LD1Onev8b_POST: { // ld1 {vt.8b}, [xn], <#imm|xm> + // if #imm post-index, value can only be 8 + const uint64_t postIndex = + (metadata_.operands[2].type == AARCH64_OP_REG) + ? sourceValues_[1].get() + : 8; + results_[0] = sourceValues_[0].get() + postIndex; + results_[1] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); + break; + } case Opcode::AArch64_LD1RD_IMM: { // ld1rd {zt.d}, pg/z, [xn, #imm] // LOAD const uint16_t partition_num = VL_bits / 64; diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc index b59f1f8cf5..b98013d2a2 100644 --- a/test/regression/aarch64/instructions/load.cc +++ b/test/regression/aarch64/instructions/load.cc @@ -231,6 +231,41 @@ TEST_P(InstLoad, ld1_multi_struct) { EXPECT_EQ(getGeneralRegister(12), getGeneralRegister(10) + 16); + // One reg, 8b elements + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x1, #8 + + # save heap address before post index + mov x10, x0 + + # Load values from heap with imm post-index + ld1 {v1.8b}, [x0], #8 + + # save heap address after post index + mov x11, x0 + + # Load values from heap with reg post-index + ld1 {v2.8b}, [x0], x1 + + mov x12, x0 + )"); + + CHECK_NEON(1, uint8_t, + {0xFF, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + CHECK_NEON(2, uint8_t, + {0x77, 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00}); + EXPECT_EQ(getGeneralRegister(11), + getGeneralRegister(10) + 8); + EXPECT_EQ(getGeneralRegister(12), + getGeneralRegister(10) + 16); + // Two reg, 16b elements RUN_AARCH64(R"( # Get heap address From 74e9b47c579bb60c949f75282ca06a86db0c797c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 11:54:41 +0100 Subject: [PATCH 12/71] Implemented SVE LD1RQB (imm offset) instruction with tests. 
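
LD1RQB loads a 16-byte quadword from memory under a byte-granular
predicate and replicates that quadword across the whole vector length.
A minimal sketch of the replicate step, assuming the predicated 16-byte
mini-vector has already been formed (illustrative names only):

    #include <cstdint>

    void replicateQuadword(uint8_t* zd, const uint8_t quad[16], unsigned vlBits) {
      for (unsigned i = 0; i < vlBits / 8; i++) zd[i] = quad[i % 16];
    }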
--- src/lib/arch/aarch64/Instruction_address.cc | 6 +++ src/lib/arch/aarch64/Instruction_execute.cc | 23 +++++++++++ test/regression/aarch64/instructions/sve.cc | 43 +++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 0758669e65..b7dc32176d 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -141,6 +141,12 @@ span Instruction::generateAddresses() { } break; } + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm}] + uint64_t addr = + sourceValues_[1].get() + metadata_.operands[2].mem.disp; + setMemoryAddresses({addr, static_cast(16)}); + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] uint64_t addr = sourceValues_[1].get() + metadata_.operands[2].mem.disp; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index b44ab3bdc0..1e5813bde2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2832,6 +2832,29 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm }] + // LOAD + const uint64_t* p = sourceValues_[0].getAsVector(); + const uint16_t partition_num = VL_bits / 8; + uint8_t out[256] = {0}; + const uint8_t* data = memoryData_[0].getAsVector(); + + // Get mini-vector (quadword) + uint8_t mini[16] = {0}; + for (int i = 0; i < 16; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (p[i / 64] & shifted_active) { + mini[i] = data[i]; + } + } + + // Duplicate mini-vector into output vector + for (int i = 0; i < partition_num; i++) { + out[i] = mini[i % 16]; + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 6a52d46b95..c1db0317fd 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4641,6 +4641,49 @@ TEST_P(InstSve, ld1rd) { CHECK_NEON(3, uint64_t, fillNeon({0x12345678}, VL / 16)); } +TEST_P(InstSve, ld1rqb) { + initialHeapData_.resize(32); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, + {0x12345678DEADBEEF, 0xABCDEF0198765432, + 0xABBACAFEFEDCBA98, 0xFEEDABCDBEADCABB}, + 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.b + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, #16] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + add x0, x0, #32 + ld1rqb {z3.b}, p1/z, [x0, #-16] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); +} + TEST_P(InstSve, ld1rqd) { initialHeapData_.resize(32); uint64_t* 
heap64 = reinterpret_cast(initialHeapData_.data()); From 4daf7050b5d83644b0f94a6725f6e68ef4e99133 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 12:09:13 +0100 Subject: [PATCH 13/71] Implemented SVE LD1RQB (reg offset) instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 6 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 1 + test/regression/aarch64/instructions/sve.cc | 37 ++++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index b7dc32176d..cd453f311e 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -141,6 +141,12 @@ span Instruction::generateAddresses() { } break; } + case Opcode::AArch64_LD1RQ_B: { // ld1rqb {zd.b}, pg/z, [xn, xm] + uint64_t addr = + sourceValues_[1].get() + sourceValues_[2].get(); + setMemoryAddresses({addr, static_cast(16)}); + break; + } case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm}] uint64_t addr = sourceValues_[1].get() + metadata_.operands[2].mem.disp; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1e5813bde2..1ed2ff172a 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2832,6 +2832,7 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1RQ_B: // ld1rqb {zd.b}, pg/z, [xn, xm] case Opcode::AArch64_LD1RQ_B_IMM: { // ld1rqb {zd.b}, pg/z, [xn{, #imm }] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c1db0317fd..47618e4fef 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4648,7 +4648,7 @@ TEST_P(InstSve, ld1rqb) { {0x12345678DEADBEEF, 0xABCDEF0198765432, 0xABBACAFEFEDCBA98, 0xFEEDABCDBEADCABB}, 4); - + // Imm offset RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -4682,6 +4682,41 @@ TEST_P(InstSve, ld1rqb) { fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, VL / 8)); + + // Reg offset + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + # Load and broadcast values from heap + ptrue p0.b + mov x1, #16 + ld1rqb {z0.b}, p0/z, [x0] + ld1rqb {z1.b}, p0/z, [x0, x1] + + # Test for inactive lanes + ptrue p1.b, vl1 + ld1rqb {z2.b}, p1/z, [x0] + ld1rqb {z3.b}, p1/z, [x0, x1] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, + 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0x98, 0xBA, 0xDC, 0xFE, 0xFE, 0xCA, 0xBA, 0xAB, + 0xBB, 0xCA, 0xAD, 0xBE, 0xCD, 0xAB, 0xED, 0xFE}, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xEF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + VL / 8)); } TEST_P(InstSve, ld1rqd) { From 810a3240ce398e0199f7ae9dc99875b71c0a8579 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 13:01:55 +0100 Subject: [PATCH 14/71] Implemented SVE UDOT (4-way, indexed) instruction and tests. 
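
UDOT (4-way, indexed) accumulates, into each 32-bit lane of Zd, the sum
of four byte-by-byte products between a group of four unsigned bytes
from Zn and the four bytes selected from Zm by the index. A minimal
sketch of one lane's accumulation (illustrative helper, not the
simulator's API):

    #include <cstdint>

    uint32_t udotLane(uint32_t acc, const uint8_t zn[4], const uint8_t zm[4]) {
      for (int j = 0; j < 4; j++)
        acc += static_cast<uint32_t>(zn[j]) * static_cast<uint32_t>(zm[j]);
      return acc;
    }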
--- .../simeng/arch/aarch64/helpers/sve.hh | 31 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 5 +++ test/regression/aarch64/instructions/sve.cc | 26 ++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 2c33ccfbe6..b19600a0dc 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1563,6 +1563,37 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `udot zd, zn, + * zm[index]`. + * D represents the element type of the destination register (i.e. for uint32_t, + * D = uint32_t). + * N represents the element type of the source registers (i.e. for uint8_t, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot_indexed( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + const int index = metadata.operands[2].vector_index; + + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + D acc = zd[i]; + for (int j = 0; j < W; j++) { + acc += (static_cast(zn[(W * i) + j]) * + static_cast(zm[(W * index) + j])); + } + out[i] = acc; + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `unpk>hi,lo> zd, * zn`. * D represents the type of the destination register (e.g. 
int32_t for diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1ed2ff172a..08d3f49ae3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5619,6 +5619,11 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] + results_[0] = sveUdot_indexed(sourceValues_, + metadata_, VL_bits); + break; + } case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); break; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 47618e4fef..9ef230d575 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7098,6 +7098,32 @@ TEST_P(InstSve, uaddv) { CHECK_NEON(3, uint64_t, {(9 * (VL / 128)), 0}); } +TEST_P(InstSve, udot) { + // udot by element + initialHeapData_.resize(16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + + dup z2.b, #2 + dup z3.b, #3 + dup z4.s, #4 + dup z5.s, #5 + + udot z4.s, z2.b, z0.b[0] + udot z5.s, z3.b, z0.b[3] + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); +} + TEST_P(InstSve, uqdec) { // d arrangement RUN_AARCH64(R"( From 2db08ae0484bdff080bef28fabc069c25a43e768 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 13:18:34 +0100 Subject: [PATCH 15/71] Implemented SVE ZIP1+2 (byte) instructions and tests. --- CMakeLists.txt | 4 ++-- src/lib/arch/aarch64/Instruction_execute.cc | 8 ++++++++ test/regression/aarch64/Exception.cc | 2 -- test/regression/aarch64/instructions/sme.cc | 4 ---- test/regression/aarch64/instructions/sve.cc | 15 ++++++++++----- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b8f4379b98..d0691578fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,10 +155,10 @@ if(SIMENG_ENABLE_TESTS) # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") - message(STATUS "LLVM version does not support AArch64 extensions SME or SVE2. These test suites will be skipped.") + message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.") endif() if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") - message(STATUS "LLVM version does not support AArch64 extensions SME2. These test suites will be skipped.") + message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. 
Related test will fail.") endif() else() diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 08d3f49ae3..0c7a3f6250 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5943,6 +5943,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, false); break; } + case Opcode::AArch64_ZIP1_ZZZ_B: { // zip1 zd.b, zn.b, zm.b + results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); + break; + } case Opcode::AArch64_ZIP1_ZZZ_D: { // zip1 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, false); break; @@ -5995,6 +5999,10 @@ void Instruction::execute() { results_[0] = sveZip_preds(sourceValues_, VL_bits, true); break; } + case Opcode::AArch64_ZIP2_ZZZ_B: { // zip2 zd.b, zn.b, zm.b + results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); + break; + } case Opcode::AArch64_ZIP2_ZZZ_D: { // zip2 zd.d, zn.d, zm.d results_[0] = sveZip_vecs(sourceValues_, VL_bits, true); break; diff --git a/test/regression/aarch64/Exception.cc b/test/regression/aarch64/Exception.cc index b987ae4429..2133629473 100644 --- a/test/regression/aarch64/Exception.cc +++ b/test/regression/aarch64/Exception.cc @@ -151,7 +151,6 @@ TEST_P(Exception, unmapped_sys_reg) { EXPECT_EQ(stdout_.substr(0, strlen(err)), err); } -#if SIMENG_LLVM_VERSION >= 14 // TODO: Write test for InstructionException::StreamingModeUpdate once it has a // trigger case // TODO: Write test for InstructionException::ZAregisterStatusUpdate once it has @@ -371,7 +370,6 @@ TEST_P(Exception, svcr) { fillNeon({0}, SVL / 8)); } } -#endif INSTANTIATE_TEST_SUITE_P( AArch64, Exception, diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 55c7b945f3..68f686609c 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,7 +7,6 @@ namespace { using InstSme = AArch64RegressionTest; -#if SIMENG_LLVM_VERSION >= 14 TEST_P(InstSme, mova) { // 8-bit RUN_AARCH64(R"( @@ -576,8 +575,5 @@ TEST_P(InstSme, zero) { INSTANTIATE_TEST_SUITE_P(AArch64, InstSme, ::testing::ValuesIn(genCoreTypeSVLPairs(EMULATION)), paramToString); -#else -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InstSme); -#endif } // namespace \ No newline at end of file diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 9ef230d575..ea8021fc96 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -8087,14 +8087,12 @@ TEST_P(InstSve, zip_pred) { } TEST_P(InstSve, zip) { - // d arrangement RUN_AARCH64(R"( # 64-bit fdup z0.d, #0.5 fdup z1.d, #-0.5 fdup z2.d, #0.75 fdup z3.d, #-0.75 - zip1 z4.d, z0.d, z1.d zip2 z5.d, z2.d, z3.d @@ -8105,16 +8103,24 @@ TEST_P(InstSve, zip) { fdup z9.s, #0.75 zip1 z10.s, z6.s, z7.s zip2 z11.s, z8.s, z9.s + + # 8-bit + dup z12.b, #1 + dup z13.b, #-2 + dup z14.b, #-1 + dup z15.b, #2 + zip1 z16.b, z12.b, z13.b + zip2 z17.b, z14.b, z15.b )"); CHECK_NEON(4, double, fillNeon({0.5, -0.5}, VL / 8)); CHECK_NEON(5, double, fillNeon({0.75, -0.75}, VL / 8)); CHECK_NEON(10, float, fillNeon({0.5, -0.75}, VL / 8)); CHECK_NEON(11, float, fillNeon({-0.5, 0.75}, VL / 8)); + CHECK_NEON(16, int8_t, fillNeon({1, -2}, VL / 8)); + CHECK_NEON(17, int8_t, fillNeon({-1, 2}, VL / 8)); } -#if SIMENG_LLVM_VERSION >= 14 -// If LLVM version supports SVE2 : TEST_P(InstSve, psel) { RUN_AARCH64(R"( mov w13, #0 @@ -8148,7 +8154,6 @@ 
TEST_P(InstSve, psel) { CHECK_PREDICATE(14, uint64_t, fillPred(VL / 8, {0}, 4)); CHECK_PREDICATE(15, uint64_t, fillPred(VL / 8, {0}, 8)); } -#endif INSTANTIATE_TEST_SUITE_P(AArch64, InstSve, ::testing::ValuesIn(genCoreTypeVLPairs(EMULATION)), From 7ac89e862ca2166c86156fd831e3dc4fabe62fce Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 14:49:06 +0100 Subject: [PATCH 16/71] Implemented SVE faddv (float and double) instructions and tests. --- .../simeng/arch/aarch64/helpers/sve.hh | 21 +++++ src/lib/arch/aarch64/Instruction_execute.cc | 10 +++ test/regression/aarch64/instructions/sve.cc | 78 +++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index b19600a0dc..25ea3dede9 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -626,6 +626,27 @@ std::enable_if_t, RegisterValue> sveFDivPredicated( return {out, 256}; } +/** Helpfer function for SVE instructions with the format `faddv rd, pg, zn. + * D represents the source vector element type and the destination scalar + * register type (i.e. for zn.s and sd, D = float). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveFaddv_predicated(srcValContainer& sourceValues, + const uint16_t VL_bits) { + const uint64_t* p = sourceValues[0].getAsVector(); + const D* zn = sourceValues[1].getAsVector(); + + const uint16_t partition_num = VL_bits / (8 * sizeof(D)); + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(D))) * sizeof(D)); + if (p[i / (64 / sizeof(D))] & shifted_active) { + out[0] += zn[i]; + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `fmad zd, pg/m, zn, * zm`. * T represents the type of sourceValues (e.g. for zn.d, T = double). 
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 0c7a3f6250..5bc1e088ca 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1388,6 +1388,16 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_FADDV_VPZ_D: { // faddv dd, p0, zn.d + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } + case Opcode::AArch64_FADDV_VPZ_S: { // faddv sd, p0, zn.s + + results_[0] = sveFaddv_predicated(sourceValues_, VL_bits); + break; + } case Opcode::AArch64_FCADD_ZPmZ_D: { // fcadd zdn.d, pg/m, zdn.d, zm.d, // #imm results_[0] = diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index ea8021fc96..03e3e4e870 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -2852,6 +2852,84 @@ TEST_P(InstSve, fadda) { CHECK_NEON(3, double, {resultB, 0}); } +TEST_P(InstSve, faddv) { + // float + initialHeapData_.resize(VL / 8); + float* fheap = reinterpret_cast(initialHeapData_.data()); + std::vector fsrc = { + 1.0f, -42.76f, -0.125f, 0.0f, 40.26f, -684.72f, -0.15f, 107.86f, + -34.71f, -0.917f, 0.0f, 80.72f, -125.67f, -0.01f, 701.90f, 7.0f}; + fillHeap(fheap, fsrc, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #4 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.s + whilelo p1.s, xzr, x2 + + ld1w {z0.s}, p0/z, [x0] + + faddv s3, p0, z0.s + faddv s4, p1, z0.s + )"); + float s3 = 0.0f; + float s4 = 0.0f; + for (int i = 0; i < VL / 32; i++) { + s3 += fsrc[i % (fsrc.size())]; + if (i < (VL / 64)) s4 += fsrc[i % (fsrc.size())]; + } + CHECK_NEON(3, float, {s3, 0.0f, 0.0f, 0.0f}); + CHECK_NEON(4, float, {s4, 0.0f, 0.0f, 0.0f}); + + // double + initialHeapData_.resize(VL); + double* dheap = reinterpret_cast(initialHeapData_.data()); + std::vector dsrc = {1.0, -42.76, -0.125, 0.0, 40.26, -684.72, + -0.15, 107.86, -34.71, -0.917, 0.0, 80.72, + -125.67, -0.01, 701.90, 7.0}; + fillHeap(dheap, dsrc, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + mov x2, xzr + mov x3, xzr + mov x4, #8 + mov x5, #2 + addvl x3, x3, #1 + sdiv x3, x3, x4 + sdiv x2, x3, x5 + + ptrue p0.d + whilelo p1.d, xzr, x2 + + ld1d {z0.d}, p0/z, [x0] + + faddv d3, p0, z0.d + faddv d4, p1, z0.d + )"); + double d3 = 0.0; + double d4 = 0.0; + for (int i = 0; i < VL / 64; i++) { + d3 += dsrc[i % (dsrc.size())]; + if (i < (VL / 128)) d4 += dsrc[i % (dsrc.size())]; + } + CHECK_NEON(3, double, {d3, 0.0}); + CHECK_NEON(4, double, {d4, 0.0}); +} + TEST_P(InstSve, fcmge) { // double initialHeapData_.resize(VL / 16); From bb737611352886e648038ac9dbf09beeb401f5f2 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 14 Oct 2024 16:07:10 +0100 Subject: [PATCH 17/71] Implemented SVE PTRUE (as counter) instructions with tests. 
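
A worked example of the predicate-as-counter encoding used by the new
helper: with a 128-bit vector length, ptrue pnd.s sets the invert bit
(0x8000), the element-size field for 32-bit elements (0b100, occupying
3 bits), and an element count of 128/32 = 4 shifted past the size field,
giving 0x8000 | 0b100 | (4 << 3) = 0x8024. The regression test checks
the same value via 0x8004 | ((VL/32) << 3).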
--- .../simeng/arch/aarch64/helpers/sve.hh | 42 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 16 +++++++ test/regression/aarch64/instructions/sve.cc | 25 +++++++++++ 3 files changed, 83 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 25ea3dede9..3736a7c766 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1340,6 +1340,48 @@ std::array svePtrue( return out; } +/** Helper function for SVE instructions with the format `ptrue pnd. + * T represents the type of sourceValues (e.g. for pnd.d, T = uint64_t). + * Returns an array of 4 uint64_t elements. */ +template +std::array svePtrue_counter(const uint16_t VL_bits) { + // Predicate as counter is 16-bits and has the following encoding: + // - Up to first 4 bits encode the element size (0b1, 0b10, 0b100, 0b1000 + // for b h s d respectively) + // - bits 0->LSZ + // - Bits LSZ -> 14 represent a uint of the number of consecutive elements + // from element 0 that are active / inactive + // - If invert bit = 0 it is number of active elements + // - If invert bit = 1 it is number of inactive elements + // - Bit 15 represents the invert bit + std::array out = {0, 0, 0, 0}; + + // Set invert bit + out[0] |= 0b1000000000000000; + + // Set Element size field + uint8_t bitsUsed = 0; + if (sizeof(T) == 1) { + out[0] |= 0b1; + bitsUsed += 1; + } else if (sizeof(T) == 2) { + out[0] |= 0b10; + bitsUsed += 2; + } else if (sizeof(T) == 4) { + out[0] |= 0b100; + bitsUsed += 3; + } else if (sizeof(T) == 8) { + out[0] |= 0b1000; + bitsUsed += 4; + } + + // Set Element count (max value is 256 (2048 bit VL for pnd.b)) + const uint64_t elementCount = VL_bits / (sizeof(T) * 8); + out[0] |= (elementCount << bitsUsed); + + return out; +} + /** Helper function for SVE instructions with the format `punpk pd.h, * pn.b`. * If `isHI` = false, then PUNPKLO is performed. 
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 5bc1e088ca..879304f6a2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4111,6 +4111,22 @@ void Instruction::execute() { results_[0] = svePtrue(metadata_, VL_bits); break; } + case Opcode::AArch64_PTRUE_C_B: { // ptrue pnd.b + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_D: { // ptrue pnd.d + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_H: { // ptrue pnd.h + results_[0] = svePtrue_counter(VL_bits); + break; + } + case Opcode::AArch64_PTRUE_C_S: { // ptrue pnd.s + results_[0] = svePtrue_counter(VL_bits); + break; + } case Opcode::AArch64_PUNPKHI_PP: { // punpkhi pd.h, pn.b results_[0] = svePunpk(sourceValues_, VL_bits, true); break; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 03e3e4e870..21a51a5d1a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5816,6 +5816,31 @@ TEST_P(InstSve, ptrue) { CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2)); } +TEST_P(InstSve, ptrue_counter) { + RUN_AARCH64(R"( + ptrue pn8.s + ptrue pn9.d + ptrue pn10.b + ptrue pn11.h + )"); + const uint64_t ps = + 0b0000000000000000000000000000000000000000000000001000000000000100 | + ((static_cast(VL / 32)) << 3); + const uint64_t pd = + 0b0000000000000000000000000000000000000000000000001000000000001000 | + ((static_cast(VL / 64)) << 4); + const uint64_t pb = + 0b0000000000000000000000000000000000000000000000001000000000000001 | + ((static_cast(VL / 8)) << 1); + const uint64_t ph = + 0b0000000000000000000000000000000000000000000000001000000000000010 | + ((static_cast(VL / 16)) << 2); + CHECK_PREDICATE(8, uint64_t, {ps, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(9, uint64_t, {pd, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(10, uint64_t, {pb, 0x0, 0x0, 0x0}); + CHECK_PREDICATE(11, uint64_t, {ph, 0x0, 0x0, 0x0}); +} + TEST_P(InstSve, punpk) { RUN_AARCH64(R"( ptrue p0.b From 9febab0c62e82d05740abe8c83bd2bf687912875 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 24 Oct 2024 14:05:01 +0100 Subject: [PATCH 18/71] Added paciasp and autiasp empty execution logic. 
--- src/lib/arch/aarch64/Instruction_address.cc | 13 ++++++ src/lib/arch/aarch64/Instruction_execute.cc | 44 +++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index cd453f311e..d7874a44a4 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -327,6 +327,19 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, + // [xn{, + // // #imm, mul vl}] + // const uint16_t partition_num = VL_bits / 64; + + // const uint64_t base = sourceValues_[1].get(); + // const uint64_t offset = + // static_cast(metadata_.operands[5].mem.disp); + // const uint64_t addr = base + (offset * 4 * partition_num * 8); + + // setMemoryAddresses({addr, static_cast((VL_bits / 8) * 4)}); + // break; + // } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 879304f6a2..303a16c4ae 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2752,6 +2752,42 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, + // [xn{, + // // #imm, mul vl}] + // // LOAD + // const uint64_t* pn = sourceValues_[0].getAsVector(); + // const uint64_t* data = memoryData_[0].getAsVector(); + + // const uint16_t partition_num = VL_bits / 64; + + // // Get predicate-as-counter information + // const bool invert = + // (pn[0] & static_cast(0b1000000000000000)) != 0; + // const uint64_t numElems = + // (pn[0] & static_cast(0b0111111111110000)) >> 4; + + // uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; + // uint64_t index = 0; + + // for (int r = 0; r < 4; r++) { + // // If invert = 1, dictates number of inactive elements at start of + // // each + // // vector. Otherwise, it is number of active elements at start of + // each + // // vector. + // int iMax = (invert) ? partition_num : numElems; + // for (int i = (invert) ? numElems : 0; i < iMax; i++) { + // out[r][i] = data[index]; + // index++; + // } + // } + // results_[0] = {out[0], 256}; + // results_[1] = {out[1], 256}; + // results_[2] = {out[2], 256}; + // results_[3] = {out[3], 256}; + // break; + // } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] // LOAD @@ -4062,6 +4098,14 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x | y; }); break; } + case Opcode::AArch64_AUTIASP: // autiasp + [[fallthrough]]; + case Opcode::AArch64_PACIASP: { // paciasp + const uint64_t x30 = sourceValues_[0].get(); + // Mimic execution by writing leaving x30 unmodified + results_[0] = {x30, 8}; + break; + } case Opcode::AArch64_PFALSE: { // pfalse pd.b uint64_t out[4] = {0, 0, 0, 0}; results_[0] = out; From b45d8d782dc3f50943994f93a9e5f98357dda85c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 10:32:28 +0100 Subject: [PATCH 19/71] Implemented NEON UMULL (uint16 to uint32) instruction and tests. 
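As a sanity check (illustrative only, not part of this patch), the per-lane widening multiply matches the expected values used in the new regression test:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Each 16-bit source lane is widened to 32 bits before multiplying.
      assert(static_cast<uint32_t>(0xFFFF) * static_cast<uint32_t>(0xFFFF) ==
             4294836225u);
      assert(static_cast<uint32_t>(0xBEEF) * static_cast<uint32_t>(0xABBA) ==
             2148818598u);
      return 0;
    }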
--- src/lib/arch/aarch64/Instruction_execute.cc | 11 ++++++ test/regression/aarch64/instructions/neon.cc | 36 ++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 303a16c4ae..0b4ce8a622 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5767,6 +5767,17 @@ void Instruction::execute() { sourceValues_[1].get()); break; } + case Opcode::AArch64_UMULLv4i16_v4i32: { // umull vd.4s, vn.4h, vm.4h + const uint16_t* vn = sourceValues_[0].getAsVector(); + const uint16_t* vm = sourceValues_[1].getAsVector(); + + uint32_t out[4] = {0}; + for (int i = 0; i < 4; i++) { + out[i] = static_cast(vn[i]) * static_cast(vm[i]); + } + results_[0] = {out, 256}; + break; + } case Opcode::AArch64_UQDECD_WPiI: { // uqdecd wd{, pattern{, MUL #imm}} results_[0] = sveUqdec(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index e8ce4f13f2..ca9ae26a4e 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3808,6 +3808,42 @@ TEST_P(InstNeon, umlal) { CHECK_NEON(3, uint64_t, {1477468749480ull, 1032ull}); } +TEST_P(InstNeon, umull) { + // uint16_t to uint32_t + initialHeapData_.resize(32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + heap16[0] = UINT16_MAX; + heap16[1] = 0; + heap16[2] = 1234; + heap16[3] = 0xBEEF; + heap16[4] = 0xABBA; + heap16[5] = 0xCAFE; + heap16[6] = 0xDEAD; + heap16[7] = 0xACDC; + + heap16[8] = UINT16_MAX; + heap16[9] = 0xACDC; + heap16[10] = 0xCAFE; + heap16[11] = 0xABBA; + heap16[12] = 0xBEEF; + heap16[13] = 0xDEAD; + heap16[14] = 9876; + heap16[15] = 0; + + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + umull v2.4s, v0.4h, v1.4h + )"); + CHECK_NEON(2, uint32_t, {4294836225u, 0, 64126044u, 2148818598u}); +} + TEST_P(InstNeon, zip) { initialHeapData_.resize(128); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); From 6383d983896d4dc40c17b5100617acf3ae36e6bb Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 11:17:34 +0100 Subject: [PATCH 20/71] Implemented RDSVL and tests. 
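A minimal sketch of the semantics (illustrative only, not part of this patch; assumes a 512-bit Streaming-Vector-Length): RDSVL multiplies the signed immediate by the streaming vector length in bytes regardless of streaming-mode state, whereas RDVL uses the current VL.

    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t SVL_bits = 512;  // assumed Streaming-Vector-Length
      auto rdsvl = [&](int64_t imm) { return imm * (SVL_bits / 8); };
      assert(rdsvl(3) == 192);
      assert(rdsvl(-3) == -192);
      assert(rdsvl(0) == 0);
      return 0;
    }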
--- src/include/simeng/arch/aarch64/ArchInfo.hh | 3 ++- src/lib/arch/aarch64/Instruction_execute.cc | 13 +++++++++++-- test/regression/aarch64/instructions/sme.cc | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/ArchInfo.hh b/src/include/simeng/arch/aarch64/ArchInfo.hh index 1403da08f8..b7f2740353 100644 --- a/src/include/simeng/arch/aarch64/ArchInfo.hh +++ b/src/include/simeng/arch/aarch64/ArchInfo.hh @@ -18,7 +18,8 @@ class ArchInfo : public simeng::arch::ArchInfo { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}), + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}), zaSize_(config["Core"]["Streaming-Vector-Length"].as() / 8) { // Generate the architecture-defined architectural register structure archRegStruct_ = { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 0b4ce8a622..7172a35de7 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4187,9 +4187,18 @@ void Instruction::execute() { results_[0] = rbit(sourceValues_, metadata_); break; } + case Opcode::AArch64_RDSVLI_XI: { // rdsvl xd, #imm + // Uses Streaming SVE vector register size, regardless of streaming mode + // state + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast( + architecture_.getStreamingVectorLength() / 8); + break; + } case Opcode::AArch64_RDVLI_XI: { // rdvl xd, #imm - int8_t imm = static_cast(metadata_.operands[1].imm); - results_[0] = (uint64_t)(imm * (VL_bits / 8)); + // Uses current vector register size + int64_t imm = metadata_.operands[1].imm; + results_[0] = imm * static_cast(VL_bits / 8); break; } case Opcode::AArch64_RET: { // ret {xr} diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 68f686609c..7171b0da0f 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -271,6 +271,21 @@ TEST_P(InstSme, ld1w) { {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); } +TEST_P(InstSme, rdsvl) { + RUN_AARCH64(R"( + rdsvl x0, #-32 + rdsvl x1, #-3 + rdsvl x2, #0 + rdsvl x3, #3 + rdsvl x4, #31 + )"); + EXPECT_EQ(getGeneralRegister(0), (SVL / 8) * -32); + EXPECT_EQ(getGeneralRegister(1), (SVL / 8) * -3); + EXPECT_EQ(getGeneralRegister(2), 0); + EXPECT_EQ(getGeneralRegister(3), (SVL / 8) * 3); + EXPECT_EQ(getGeneralRegister(4), (SVL / 8) * 31); +} + TEST_P(InstSme, st1d) { // Horizontal initialHeapData_.resize(SVL / 4); From 64162374ad15a42c9c5312f0f0396fad2c5a82fa Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 12:11:55 +0100 Subject: [PATCH 21/71] Implemented ZERO {zt0} instruction with test. 
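Note (illustrative only, not part of this patch): ZT0 has a fixed 512-bit width regardless of SVL, so zeroing it amounts to a single 64-byte write, which the new CHECK_TABLE helper then inspects element by element.

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::array<uint8_t, 64> zt0;  // 512-bit lookup-table register
      zt0.fill(0);                  // effect of `zero {zt0}`
      for (uint8_t byte : zt0) assert(byte == 0);
      return 0;
    }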
--- src/lib/arch/aarch64/Instruction_execute.cc | 9 +++++ test/integration/ConfigTest.cc | 6 ++-- .../aarch64/AArch64RegressionTest.hh | 35 +++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 9 +++++ test/unit/aarch64/ArchInfoTest.cc | 3 +- 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 7172a35de7..c41c80b2b5 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6139,6 +6139,15 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_ZERO_T: { // zero {zt0} + // SME + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); + + // ZT0 has a fixed width of 512-bits + results_[0] = RegisterValue(0, 64); + break; + } default: return executionNYI(); } diff --git a/test/integration/ConfigTest.cc b/test/integration/ConfigTest.cc index 48975eeacd..49a028ebb8 100644 --- a/test/integration/ConfigTest.cc +++ b/test/integration/ConfigTest.cc @@ -24,7 +24,8 @@ TEST(ConfigTest, Default) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, @@ -384,7 +385,8 @@ TEST(ConfigTest, configFromFile) { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; EXPECT_EQ(simeng::config::SimInfo::getSysRegVec(), sysRegisterEnums); std::vector archRegStruct = { {8, 32}, diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 3e39fa59fe..8285726ee7 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -190,6 +190,24 @@ inline std::vector> genCoreTypeSVLPairs( checkMatrixRegisterCol(tag, index, __VA_ARGS__); \ } +/** Check each element of the Lookup Table register ZT0 against expected values. + * + * The `tag` argument is the register index (must be 0), and the `type` argument + * is the C++ data type to use for value comparisons. The third argument should + * be an initializer list containing one value for each register element (for a + * total of `(64 / sizeof(type))` values). + * + * For example: + * + * // Compare zt0 to some expected 32-bit uint64 values. + * CHECK_TABLE(0, uint32_t, {1, 2, 3, 4, ..., 16}); + */ +#define CHECK_TABLE(tag, type, ...) \ + { \ + SCOPED_TRACE("<<== error generated here"); \ + checkTableRegister(tag, __VA_ARGS__); \ + } + /** A helper macro to predecode the first instruction in a snippet of Armv9.2-a * assembly code and check the assigned group(s) for each micro-op matches the * expected group(s). Returns from the calling function if a fatal error occurs. @@ -361,6 +379,23 @@ class AArch64RegressionTest : public RegressionTest { } } + /** Check the elements of the ZT0 lookup table register. + * + * This should be invoked via the `CHECK_TABLE` macro in order to provide + * better diagnostic messages, rather than called directly from test code. 
+ */ + template + void checkTableRegister(uint8_t tag, + const std::array& values) const { + assert(tag == 0 && "Only a tag of value 0 is valid for Table registers"); + const T* data = RegressionTest::getVectorRegister( + {simeng::arch::aarch64::RegisterType::TABLE, tag}); + for (unsigned i = 0; i < (64 / sizeof(T)); i++) { + EXPECT_NEAR(data[i], values[i], 0.0005) + << "Mismatch for element " << i << "."; + } + } + /** Get the value of a general purpose register. */ template T getGeneralRegister(uint8_t tag) const { diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 7171b0da0f..01def5ce7d 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -534,6 +534,15 @@ TEST_P(InstSme, st1w) { } TEST_P(InstSme, zero) { + // ZT0 + RUN_AARCH64(R"( + smstart + + zero {zt0} + )"); + CHECK_TABLE(0, uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); + + // ZA tiles RUN_AARCH64(R"( smstart diff --git a/test/unit/aarch64/ArchInfoTest.cc b/test/unit/aarch64/ArchInfoTest.cc index 39e25a0bd1..a2b41a9ec2 100644 --- a/test/unit/aarch64/ArchInfoTest.cc +++ b/test/unit/aarch64/ArchInfoTest.cc @@ -23,7 +23,8 @@ class AArch64ArchInfoTest : public ::testing::Test { aarch64_sysreg::AARCH64_SYSREG_MIDR_EL1, aarch64_sysreg::AARCH64_SYSREG_CNTVCT_EL0, aarch64_sysreg::AARCH64_SYSREG_PMCCNTR_EL0, - aarch64_sysreg::AARCH64_SYSREG_SVCR}; + aarch64_sysreg::AARCH64_SYSREG_SVCR, + aarch64_sysreg::AARCH64_SYSREG_TPIDR2_EL0}; const std::vector archRegStruct = { {8, 32}, From 9a3dc35261ec772c56a912e2aabeeae9aaa1944f Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 17:05:46 +0100 Subject: [PATCH 22/71] Implemented ld1d (4 consec vecs, uint64) SVE instruction with tests, and fixed PTRUE (counter) implementation. 
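For reference, a rough sketch (not part of this patch; assumes a 256-bit VL) of how the counter encoding is expanded to per-element activity across the four destination vectors, mirroring the decode used in the LD1D (4 reg) execution case:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint16_t VL_bits = 256;               // assumed VL
      const uint16_t elemsPerVec = VL_bits / 64;  // .d elements per vector
      const uint64_t pn = 0b1000000000001000;     // ptrue pnd.d: invert=1, count=0
      const bool invert = (pn & 0x8000) != 0;
      const uint64_t count = (pn & 0x7FF0) >> 4;  // element count field for .d
      for (int r = 0; r < 4; r++) {
        for (int i = 0; i < elemsPerVec; i++) {
          // Inside the first `count` elements: active unless inverted.
          bool active = (static_cast<uint64_t>(r * elemsPerVec + i) < count)
                            ? !invert : invert;
          std::printf("z%d[%d]: %s\n", r, i, active ? "active" : "inactive");
        }
      }
      return 0;
    }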
--- .../simeng/arch/aarch64/helpers/sve.hh | 12 +--- src/lib/arch/aarch64/Instruction_address.cc | 34 ++++++---- src/lib/arch/aarch64/Instruction_execute.cc | 64 +++++++++---------- test/regression/aarch64/instructions/sve.cc | 59 ++++++++++++++--- 4 files changed, 105 insertions(+), 64 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 3736a7c766..df924c1f8c 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1356,29 +1356,21 @@ std::array svePtrue_counter(const uint16_t VL_bits) { // - Bit 15 represents the invert bit std::array out = {0, 0, 0, 0}; - // Set invert bit + // Set invert bit to 1 and count to 0 + // (The first 0 elements are FALSE) out[0] |= 0b1000000000000000; // Set Element size field - uint8_t bitsUsed = 0; if (sizeof(T) == 1) { out[0] |= 0b1; - bitsUsed += 1; } else if (sizeof(T) == 2) { out[0] |= 0b10; - bitsUsed += 2; } else if (sizeof(T) == 4) { out[0] |= 0b100; - bitsUsed += 3; } else if (sizeof(T) == 8) { out[0] |= 0b1000; - bitsUsed += 4; } - // Set Element count (max value is 256 (2048 bit VL for pnd.b)) - const uint64_t elementCount = VL_bits / (sizeof(T) * 8); - out[0] |= (elementCount << bitsUsed); - return out; } diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index d7874a44a4..f6171fff6a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -327,19 +327,27 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } - // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, - // [xn{, - // // #imm, mul vl}] - // const uint16_t partition_num = VL_bits / 64; - - // const uint64_t base = sourceValues_[1].get(); - // const uint64_t offset = - // static_cast(metadata_.operands[5].mem.disp); - // const uint64_t addr = base + (offset * 4 * partition_num * 8); - - // setMemoryAddresses({addr, static_cast((VL_bits / 8) * 4)}); - // break; - // } + case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index c41c80b2b5..3b7bee1704 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2752,42 +2752,40 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } - // case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, - // [xn{, - // // #imm, mul vl}] - // // LOAD - // const uint64_t* pn = sourceValues_[0].getAsVector(); - // const uint64_t* data = memoryData_[0].getAsVector(); - - // const uint16_t partition_num = VL_bits / 64; + case 
Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t* pn = sourceValues_[0].getAsVector(); - // // Get predicate-as-counter information - // const bool invert = - // (pn[0] & static_cast(0b1000000000000000)) != 0; - // const uint64_t numElems = - // (pn[0] & static_cast(0b0111111111110000)) >> 4; + // Get predicate-as-counter information + const bool invert = + (pn[0] & static_cast(0b1000000000000000)) != 0; + const uint64_t predElemCount = + (pn[0] & static_cast(0b0111111111110000)) >> 4; - // uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; - // uint64_t index = 0; + uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 64; - // for (int r = 0; r < 4; r++) { - // // If invert = 1, dictates number of inactive elements at start of - // // each - // // vector. Otherwise, it is number of active elements at start of - // each - // // vector. - // int iMax = (invert) ? partition_num : numElems; - // for (int i = (invert) ? numElems : 0; i < iMax; i++) { - // out[r][i] = data[index]; - // index++; - // } - // } - // results_[0] = {out[0], 256}; - // results_[1] = {out[1], 256}; - // results_[2] = {out[2], 256}; - // results_[3] = {out[3], 256}; - // break; - // } + for (int r = 0; r < 4; r++) { + for (int i = 0; i < partition_num; i++) { + // If invert = 1, predElemCount dictates number of initial inactive + // elements. + // Otherwise, it is number of initial active elements. + if ((r * partition_num) + i < predElemCount) { + out[r][i] = + (invert) ? 0 : memoryData_[r].getAsVector()[i]; + } else { + out[r][i] = + (invert) ? memoryData_[r].getAsVector()[i] : 0; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D_IMM: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 21a51a5d1a..84d351c60e 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5063,6 +5063,7 @@ TEST_P(InstSve, ld1d_gather) { } TEST_P(InstSve, ld1d) { + // Single vector initialHeapData_.resize(VL / 4); uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; @@ -5104,6 +5105,52 @@ TEST_P(InstSve, ld1d) { fillNeon({src[(base) % 4], src[(base + 1) % 4], src[(base + 2) % 4], src[(base + 3) % 4]}, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint64_t* heap64_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap64_multi, src_multi, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + + ptrue pn8.d + + ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] + )"); + base = (VL / 64) * 4; + uint16_t offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + 
(offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld1h) { @@ -5824,17 +5871,13 @@ TEST_P(InstSve, ptrue_counter) { ptrue pn11.h )"); const uint64_t ps = - 0b0000000000000000000000000000000000000000000000001000000000000100 | - ((static_cast(VL / 32)) << 3); + 0b0000000000000000000000000000000000000000000000001000000000000100; const uint64_t pd = - 0b0000000000000000000000000000000000000000000000001000000000001000 | - ((static_cast(VL / 64)) << 4); + 0b0000000000000000000000000000000000000000000000001000000000001000; const uint64_t pb = - 0b0000000000000000000000000000000000000000000000001000000000000001 | - ((static_cast(VL / 8)) << 1); + 0b0000000000000000000000000000000000000000000000001000000000000001; const uint64_t ph = - 0b0000000000000000000000000000000000000000000000001000000000000010 | - ((static_cast(VL / 16)) << 2); + 0b0000000000000000000000000000000000000000000000001000000000000010; CHECK_PREDICATE(8, uint64_t, {ps, 0x0, 0x0, 0x0}); CHECK_PREDICATE(9, uint64_t, {pd, 0x0, 0x0, 0x0}); CHECK_PREDICATE(10, uint64_t, {pb, 0x0, 0x0, 0x0}); From 487365779bf3584675eb94b3a7236df105f6d874 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 25 Oct 2024 17:10:45 +0100 Subject: [PATCH 23/71] Implemented ld1d (2 consec vecs, uint64) SVE instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 19 ++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 32 +++++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 31 +++++++++++++++++++- 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index f6171fff6a..ecb5cab359 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -327,6 +327,25 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 3b7bee1704..bdcaeb1b1e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2752,6 +2752,38 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t* pn = sourceValues_[0].getAsVector(); + + // Get predicate-as-counter information + const bool invert = + (pn[0] & static_cast(0b1000000000000000)) != 0; + const uint64_t predElemCount = + (pn[0] & static_cast(0b0111111111110000)) >> 4; + + uint64_t out[2][32] = {{0}, {0}}; + const 
uint16_t partition_num = VL_bits / 64; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + // If invert = 1, predElemCount dictates number of initial inactive + // elements. + // Otherwise, it is number of initial active elements. + if ((r * partition_num) + i < predElemCount) { + out[r][i] = + (invert) ? 0 : memoryData_[r].getAsVector()[i]; + } else { + out[r][i] = + (invert) ? memoryData_[r].getAsVector()[i] : 0; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 84d351c60e..9933a5f653 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5112,6 +5112,35 @@ TEST_P(InstSve, ld1d) { std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; fillHeap(heap64_multi, src_multi, VL / 8); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.d, #1 + dup z1.d, #2 + + ptrue pn8.d + + ld1d {z0.d, z1.d}, pn8/z, [x0, #2, mul vl] + )"); + base = (VL / 64) * 2; + uint16_t offset = (VL / 64); + CHECK_NEON(0, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + // Four vector RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -5128,7 +5157,7 @@ TEST_P(InstSve, ld1d) { ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] )"); base = (VL / 64) * 4; - uint16_t offset = (VL / 64); + offset = (VL / 64); CHECK_NEON(0, uint64_t, fillNeon({src[(base) % 4], src[(base + 1) % 4], src[(base + 2) % 4], src[(base + 3) % 4]}, From b82ec90b1b6f355144d91bb79cf9617e2e2b9cd0 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 13:12:59 +0000 Subject: [PATCH 24/71] Implemented SME mova (tile to vec, 4 regs, 8-bit) instruction with tests. 
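A rough sketch of the row selection performed by the new execution case (illustrative only, not part of this patch; assumes a 512-bit VL): each destination vector receives one horizontal byte slice of ZA0, with the row index wrapping modulo the number of slices.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned VL_bits = 512;             // assumed VL
      const unsigned sliceCount = VL_bits / 8;  // rows in the .b tile
      const unsigned ws = 1;                    // slice-select register (w13 in the test)
      const unsigned offs1 = 0, offs4 = 3;      // slice offset range 0:3
      for (unsigned i = offs1; i <= offs4; i++) {
        // Destinations z8..z11 take row (ws + i) mod sliceCount of za0h.b
        std::printf("z%u <- za0h.b[row %u]\n", 8 + (i - offs1),
                    (ws + i) % sliceCount);
      }
      return 0;
    }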
--- src/lib/arch/aarch64/Instruction_execute.cc | 28 ++++++++++ test/regression/aarch64/instructions/sme.cc | 58 ++++++++++++++++++--- 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index bdcaeb1b1e..eeadf477eb 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3892,6 +3892,34 @@ void Instruction::execute() { results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; } + case Opcode::AArch64_MOVA_4ZMXI_H_B: { // mova {zd1.b - zd4.b}, + // za0h.b[ws, offs1:offs4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t sliceCount = VL_bits / 8; + + const uint32_t ws = sourceValues_[sliceCount].get(); + const uint8_t offs1 = + metadata_.operands[4].sme.slice_offset.imm_range.first; + const uint8_t offs4 = + metadata_.operands[4].sme.slice_offset.imm_range.offset; + + uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; + + for (uint8_t i = offs1; i <= offs4; i++) { + // Get correct next row + const uint8_t* row = + sourceValues_[(ws + i) % sliceCount].getAsVector(); + // Update out and results_ + const uint8_t index = i - offs1; + memcpy(out[index], row, sliceCount); + results_[index] = {out[index], 256}; + } + break; + } case Opcode::AArch64_MOVID: { // movi dd, #imm results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 01def5ce7d..16716a5603 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -47,6 +47,47 @@ TEST_P(InstSme, mova) { CHECK_NEON(7, float, fillNeonCombined({4}, {10}, SVL / 8)); } +TEST_P(InstSme, mova_tilesToVecs) { + // uint8_t; 4 vectors + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + mov w12, #0 + ptrue p0.s + + # Pre-fill first 4 rows of za0.b + ld1w {za0h.s[w12, 0]}, p0/z, [x0] + ld1w {za1h.s[w12, 0]}, p0/z, [x0] + ld1w {za2h.s[w12, 0]}, p0/z, [x0] + ld1w {za3h.s[w12, 0]}, p0/z, [x0] + + + mova {z4.b-z7.b}, za0h.b[w12, 0:3] + + # Test Alias + mov w13, #1 + dup z11.b, #3 + mov {z8.b-z11.b}, za0h.b[w13, 0:3] + )"); + for (int i = 4; i <= 10; i++) { + CHECK_NEON( + i, uint8_t, + fillNeon({0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, + 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}, + SVL / 8)); + } + CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); +} + TEST_P(InstSme, fmopa) { // 32-bit RUN_AARCH64(R"( @@ -218,15 +259,16 @@ TEST_P(InstSme, ld1w) { whilelo p1.s, xzr, x1 ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon( + {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 3, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 1, uint64_t, - fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); - CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 3, uint64_t, - fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); - CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, - fillNeonCombined( - {0x12345678DEADBEEF, 
0xABCDEF0198765432}, {0}, SVL / 8)); + AARCH64_REG_ZAS1, 1, uint32_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); // Vertical initialHeapData_.resize(SVL / 4); From 89d7501d0ac41c1164040a5846d12990db0ead1e Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 16:02:42 +0000 Subject: [PATCH 25/71] Implemented pred-as-counter to pred_as_mask function, and added unit tests. --- .../simeng/arch/aarch64/Instruction.hh | 36 +++++++++++++++++++ test/unit/aarch64/InstructionTest.cc | 33 +++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index b5f1f07cc5..bee47e01bc 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -283,6 +283,42 @@ enum class InsnType : uint32_t { isBranch = 1 << 14 }; +/** Predefined shift values for converting pred-as-counter to pred-as-mask. */ +const uint64_t predCountShiftVals[9] = {0, 1, 2, 0, 3, 0, 0, 0, 4}; + +/** Convert Predicate-as-Mask to Predicate-as-Masks. + * T represents the element type (i.e. for pg.s, T = uint32_t). + * V represents the number of vectors the predicate-as-counter is being used + * for. */ +template +std::vector> predAsCounterToMasks( + const uint64_t predAsCounter, const uint16_t VL_bits) { + std::vector> out(V, {0, 0, 0, 0}); + + const uint16_t elemsPerVec = VL_bits / (sizeof(T) * 8); + // Get predicate-as-counter information + const bool invert = (predAsCounter & 0b1000000000000000) != 0; + const uint64_t predElemCount = + (predAsCounter & static_cast(0b0111111111111111)) >> + predCountShiftVals[sizeof(T)]; + + for (int r = 0; r < V; r++) { + for (int i = 0; i < elemsPerVec; i++) { + // Move bit to next position based on element type + uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); + // If invert = 1, predElemCount dictates number of initial inactive + // elements. + // If invert = 0, it is number of initial active elements. + if ((r * elemsPerVec) + i < predElemCount) { + out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; + } else { + out[r][i / (64 / sizeof(T))] |= (invert) ? shifted_active : 0; + } + } + } + return out; +} + /** A basic Armv9.2-a implementation of the `Instruction` interface. */ class Instruction : public simeng::Instruction { public: diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc index 1ecf14a1a6..80024bc1c6 100644 --- a/test/unit/aarch64/InstructionTest.cc +++ b/test/unit/aarch64/InstructionTest.cc @@ -627,6 +627,39 @@ TEST_F(AArch64InstructionTest, setters) { EXPECT_TRUE(insn.isWaitingCommit()); } +// Test predAsCounterToMasks function. 
+TEST_F(AArch64InstructionTest, predAsCounterToMasks_test) { + // 1.5 full vectors from start, VL = 128b, uint8_t elem size + std::vector> ref(2, {0, 0, 0, 0}); + ref[0][0] = + 0b0000000000000000000000000000000000000000000000001111111111111111; + ref[1][0] = + 0b0000000000000000000000000000000000000000000000000000000011111111; + // invert = 0, num active Elems = 24 + uint64_t pn = + 0b0000000000000000000000000000000000000000000000000000000000110001; + auto out = predAsCounterToMasks(pn, 128); + EXPECT_EQ(out[0][0], ref[0][0]); + EXPECT_EQ(out[1][0], ref[1][0]); + + // 0.5 of last vector, VL = 1024b, uint64_t elem size + std::vector> ref2(4, {0, 0, 0, 0}); + ref2[3][1] = + 0b0000000100000001000000010000000100000001000000010000000100000001; + // Invert = 1, num inactive Elems = 56 + uint64_t pn2 = + 0b0000000000000000000000000000000000000000000000001000001110001000; + auto out2 = predAsCounterToMasks(pn2, 1024); + EXPECT_EQ(out2[0][0], ref2[0][0]); + EXPECT_EQ(out2[0][1], ref2[0][1]); + EXPECT_EQ(out2[1][0], ref2[1][0]); + EXPECT_EQ(out2[1][1], ref2[1][1]); + EXPECT_EQ(out2[2][0], ref2[2][0]); + EXPECT_EQ(out2[2][1], ref2[2][1]); + EXPECT_EQ(out2[3][0], ref2[3][0]); + EXPECT_EQ(out2[3][1], ref2[3][1]); +} + } // namespace aarch64 } // namespace arch } // namespace simeng \ No newline at end of file From ad5bd876c42b053794fcce9a346e5084ba95a875 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 17:06:52 +0000 Subject: [PATCH 26/71] Implemented st1d (2 consec vecs, uint64) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 21 ++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 56 ++++++++++----------- test/regression/aarch64/instructions/sve.cc | 35 +++++++++++++ 3 files changed, 82 insertions(+), 30 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index ecb5cab359..9881aeec1a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1002,6 +1002,27 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] const uint64_t* p = sourceValues_[2].getAsVector(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index eeadf477eb..460be3f700 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2755,28 +2755,18 @@ void Instruction::execute() { case Opcode::AArch64_LD1D_2Z_IMM: { // ld1d {zt1.d, zt2.d}, png/z, [xn{, // #imm, mul vl}] // LOAD - const uint64_t* pn = sourceValues_[0].getAsVector(); + const uint64_t pn = sourceValues_[0].get(); - // Get predicate-as-counter information - const bool 
invert = - (pn[0] & static_cast(0b1000000000000000)) != 0; - const uint64_t predElemCount = - (pn[0] & static_cast(0b0111111111110000)) >> 4; + auto preds = predAsCounterToMasks(pn, VL_bits); uint64_t out[2][32] = {{0}, {0}}; const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 2; r++) { for (int i = 0; i < partition_num; i++) { - // If invert = 1, predElemCount dictates number of initial inactive - // elements. - // Otherwise, it is number of initial active elements. - if ((r * partition_num) + i < predElemCount) { - out[r][i] = - (invert) ? 0 : memoryData_[r].getAsVector()[i]; - } else { - out[r][i] = - (invert) ? memoryData_[r].getAsVector()[i] : 0; + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; } } } @@ -2787,28 +2777,18 @@ void Instruction::execute() { case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] // LOAD - const uint64_t* pn = sourceValues_[0].getAsVector(); + const uint64_t pn = sourceValues_[0].get(); - // Get predicate-as-counter information - const bool invert = - (pn[0] & static_cast(0b1000000000000000)) != 0; - const uint64_t predElemCount = - (pn[0] & static_cast(0b0111111111110000)) >> 4; + auto preds = predAsCounterToMasks(pn, VL_bits); uint64_t out[4][32] = {{0}, {0}, {0}, {0}}; const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 4; r++) { for (int i = 0; i < partition_num; i++) { - // If invert = 1, predElemCount dictates number of initial inactive - // elements. - // Otherwise, it is number of initial active elements. - if ((r * partition_num) + i < predElemCount) { - out[r][i] = - (invert) ? 0 : memoryData_[r].getAsVector()[i]; - } else { - out[r][i] = - (invert) ? memoryData_[r].getAsVector()[i] : 0; + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (preds[r][i / 8] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; } } } @@ -4800,6 +4780,22 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn|sp] // STORE diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 9933a5f653..8e26434bd4 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6638,6 +6638,41 @@ TEST_P(InstSve, st1d) { } } +TEST_P(InstSve, st1d_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap64, src, VL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + st1d {z0.d, z1.d}, pn8, [sp] + st1d {z0.d, z1.d}, pn8, [x4, #2, mul vl] + 
)"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + } + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(65792 + (2 * (VL / 8)) + (i * 8)), + src[i % 4]); + } +} + TEST_P(InstSve, st2d) { // 32-bit RUN_AARCH64(R"( From 9bf115a2beb5d3093b773af9cdd8ca7147ae2700 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 28 Oct 2024 17:18:17 +0000 Subject: [PATCH 27/71] Implemented st1d (2 consec vecs, uint64, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 20 ++++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 10 +++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 9881aeec1a..1087668f53 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1002,6 +1002,26 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_2Z: { // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, // #imm, mul vl}] const uint64_t pn = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 460be3f700..e6c070d3b6 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4780,6 +4780,10 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1D_2Z: // st1d {zt1.d, zt2.d}, png, [xn, xm, lsl + // #3] + // STORE + [[fallthrough]]; case Opcode::AArch64_ST1D_2Z_IMM: { // st1d {zt1.d, zt2.d}, png, [xn{, // #imm, mul vl}] // STORE diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 8e26434bd4..c52fd16396 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6651,6 +6651,7 @@ TEST_P(InstSve, st1d_multivec) { svc #0 sub sp, sp, #4095 + mov x1, #1 mov x4, #256 madd x4, x4, x4, x4 ptrue p0.d @@ -6658,18 +6659,17 @@ TEST_P(InstSve, st1d_multivec) { ld1d {z0.d}, p0/z, [x0] ld1d {z1.d}, p0/z, [x0, #1, mul vl] st1d {z0.d, z1.d}, pn8, [sp] - st1d {z0.d, z1.d}, pn8, [x4, #2, mul vl] + st1d {z0.d, z1.d}, pn8, [x4, #4, mul vl] + st1d {z0.d, z1.d}, pn8, [x4, x1, lsl #3] )"); for (uint64_t i = 0; i < (VL / 32); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 8)), src[i % 4]); - } - - for (uint64_t i = 0; i < (VL / 32); i++) { - EXPECT_EQ(getMemoryValue(65792 + (2 * (VL / 8)) + (i * 8)), + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 8)), src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 
+ 8 + (i * 8)), src[i % 4]); } } From ff8bb58094b9c266151c7da45fd3aa77e50a60b2 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 12:10:37 +0000 Subject: [PATCH 28/71] Implemented LD1W (2 vec and 4 vec, imm offset) SVE2 instructions with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 42 ++++++++++- src/lib/arch/aarch64/Instruction_execute.cc | 46 ++++++++++++ test/regression/aarch64/instructions/sve.cc | 77 ++++++++++++++++++++- 3 files changed, 163 insertions(+), 2 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 1087668f53..1db4d54f4f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -333,7 +333,7 @@ span Instruction::generateAddresses() { const uint64_t base = sourceValues_[1].get(); const uint64_t offset = - static_cast(metadata_.operands[5].mem.disp); + static_cast(metadata_.operands[3].mem.disp); const uint64_t addr = base + (offset * partition_num * 8); std::vector addresses; @@ -407,6 +407,46 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD2D: { // ld2d {zt1.d, zt2.d}, pg/z, [xn|sp, xm, // lsl #3] const uint64_t base = sourceValues_[1].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e6c070d3b6..13bed72f1b 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3251,6 +3251,52 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[2][64] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } + case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, 
png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + const uint16_t partition_num = VL_bits / 32; + + for (int r = 0; r < 4; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (preds[r][i / 16] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] // LOAD const int index = metadata_.operands[0].vector_index; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c52fd16396..46cd7bdac3 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5221,11 +5221,11 @@ TEST_P(InstSve, ld1h) { } TEST_P(InstSve, ld1w) { + // Single vector initialHeapData_.resize(VL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; fillHeap(heap32, src, VL / 16); - RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -5260,6 +5260,81 @@ TEST_P(InstSve, ld1w) { CHECK_NEON(3, uint64_t, fillNeonCombined( {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, VL / 8)); + + // Multi vector + initialHeapData_.resize(VL); + uint32_t* heap32_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_multi, src_multi, VL / 4); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + + ptrue pn8.s + + ld1w {z0.s, z1.s}, pn8/z, [x0, #2, mul vl] + )"); + uint16_t base = (VL / 32) * 2; + uint16_t offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + + ptrue pn8.s + + ld1w {z0.s - z3.s}, pn8/z, [x0, #4, mul vl] + )"); + base = (VL / 32) * 4; + offset = (VL / 32); + CHECK_NEON(0, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 1, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld2d) { From c40e9f4272797f724203dc7f0bee92f3640916d6 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 12:18:48 +0000 Subject: [PATCH 29/71] Implemented LD1W (2 vec, scalar offset) SVE2 instruction with tests. 
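Worked example of the scalar-offset addressing (illustrative only, not part of this patch; assumes a 256-bit VL and an example base address): the base is offset by xm scaled to bytes, and the second destination vector streams from one vector-length further on.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t VL_bits = 256;                  // assumed VL
      const uint64_t base = 0x1000;                  // example heap address in xn
      const uint64_t xm = 2;                         // as in the new test (mov x1, #2)
      const uint64_t addr0 = base + (xm << 2);       // first vector: base + 8 bytes
      const uint64_t addr1 = addr0 + (VL_bits / 8);  // second vector follows on
      assert(addr0 == 0x1008);
      assert(addr1 == 0x1028);
      return 0;
    }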
--- src/lib/arch/aarch64/Instruction_address.cc | 16 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 11 +++++++++++ 3 files changed, 31 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 1db4d54f4f..c43e8ec7b2 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -407,6 +407,22 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1W_2Z: { // ld1w {zt1.s, zt2.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 32; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 13bed72f1b..e977e3e0dd 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3251,6 +3251,10 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1W_2Z: // ld1w {zt1.s, zt2.s}, png/z, [xn, xm, + // lsl #2] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1W_2Z_IMM: { // ld1w {zt1.s, zt2.s}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 46cd7bdac3..96a168a5c2 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5277,10 +5277,14 @@ TEST_P(InstSve, ld1w) { dup z0.s, #1 dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 ptrue pn8.s + mov x1, #2 ld1w {z0.s, z1.s}, pn8/z, [x0, #2, mul vl] + ld1w {z2.s, z3.s}, pn8/z, [x0, x1, lsl #2] )"); uint16_t base = (VL / 32) * 2; uint16_t offset = (VL / 32); @@ -5295,6 +5299,13 @@ TEST_P(InstSve, ld1w) { src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, VL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon({src[2], src[3], src[0], src[1]}, VL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({src[(2 + offset) % 4], src[(3 + offset) % 4], + src[(0 + offset) % 4], src[(1 + offset) % 4]}, + VL / 8)); + // Four vector RUN_AARCH64(R"( # Get heap address From 5f4fd1c4bea34362b91b539ce33affeda09472d4 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 12:35:34 +0000 Subject: [PATCH 30/71] Implemented ST1W (2 vec, imm and scalar offset) SVE2 instructions with tests. 
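Worked example of the immediate-offset form used in the new test (illustrative only, not part of this patch; assumes a 256-bit VL): `mov x4, #256; madd x4, x4, x4, x4` gives a base of 256*256 + 256 = 65792, and `#4, mul vl` advances the start address by four vector-lengths, matching the addresses the test reads back.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t VL_bits = 256;               // assumed VL
      const uint64_t elemsPerVec = VL_bits / 32;  // .s elements per vector
      const uint64_t base = 256 * 256 + 256;      // madd x4, x4, x4, x4 -> 65792
      const int64_t imm = 4;                      // #4, mul vl
      const uint64_t addr = base + imm * elemsPerVec * 4;  // base + 4 * (VL/8) bytes
      assert(base == 65792);
      assert(addr == 65792 + 4 * (VL_bits / 8));
      return 0;
    }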
--- src/lib/arch/aarch64/Instruction_address.cc | 41 +++++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 20 ++++++++++ test/regression/aarch64/instructions/sve.cc | 35 ++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index c43e8ec7b2..aeec71008f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1208,6 +1208,47 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1W_2Z: { // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const uint64_t offset = sourceValues_[4].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[2].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[3].get(); + const int64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index e977e3e0dd..87f52e089e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5054,6 +5054,26 @@ void Instruction::execute() { memoryData_ = sve_merge_store_data(d, p, VL_bits); break; } + case Opcode::AArch64_ST1W_2Z: // st1w {zt1.s, zt2.s}, png, [xn, xm, lsl + // #2] + // STORE + [[fallthrough]]; + case Opcode::AArch64_ST1W_2Z_IMM: { // st1w {zt1.s, zt2.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t pn = sourceValues_[2].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + break; + } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE const uint16_t* t = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 96a168a5c2..8410f6d724 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6977,6 +6977,41 @@ 
TEST_P(InstSve, st1w) { } } +TEST_P(InstSve, st1w_multivec) { + // Two vectors + initialHeapData_.resize(VL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, VL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + st1w {z0.s, z1.s}, pn8, [sp] + st1w {z0.s, z1.s}, pn8, [x4, #4, mul vl] + st1w {z0.s, z1.s}, pn8, [x4, x1, lsl #2] + )"); + + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (4 * (VL / 8)) + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 4)), src[i % 4]); + } +} + TEST_P(InstSve, str_predicate) { initialHeapData_.resize(VL / 64); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 7a717e189e7d49342945c7292e1c137454bf3c44 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 15:27:49 +0000 Subject: [PATCH 31/71] Implemented LD1B (2 vec, imm and scalar offset) SVE2 instructions with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 34 ++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 25 ++++++ test/regression/aarch64/instructions/sve.cc | 89 +++++++++++++++++++++ 3 files changed, 148 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index aeec71008f..a0ed89fcd4 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -319,6 +319,40 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1B_2Z: { // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 87f52e089e..b5496371fc 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2733,6 +2733,31 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1B_2Z: // ld1b {zt1.b, zt2.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_2Z_IMM: { // ld1b {zt1.b, zt2.b}, 
png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[2][256] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 8410f6d724..00ae0efe56 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4893,6 +4893,7 @@ TEST_P(InstSve, ld1rw) { } TEST_P(InstSve, ld1b) { + // Single vector initialHeapData_.resize(VL / 4); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, @@ -4930,6 +4931,94 @@ TEST_P(InstSve, ld1b) { VL / 16)); std::rotate(src.begin(), src.begin() + ((VL / 8) % 16), src.end()); CHECK_NEON(2, uint8_t, fillNeon(src, VL / 16)); + + // Multi vector + initialHeapData_.resize(VL); + uint8_t* heap8_multi = reinterpret_cast(initialHeapData_.data()); + std::vector src_multi = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, + 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, + 0x01, 0xEF, 0xCD, 0xAB}; + ; + fillHeap(heap8_multi, src_multi, VL); + + // Two vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + mov x1, #2 + + ld1b {z0.b, z1.b}, pn8/z, [x0, #2, mul vl] + ld1b {z2.b, z3.b}, pn8/z, [x0, x1] + )"); + uint16_t base = (VL / 8) * 2; + uint16_t offset = (VL / 8); + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({src[2], src[3], src[4], src[5], src[6], src[7], + src[8], src[9], src[10], src[11], src[12], + src[13], src[14], src[15], src[0], src[1]}, + VL / 8)); + CHECK_NEON( + 3, uint8_t, + fillNeon({src[(2 + offset) % 16], src[(3 + offset) % 16], + src[(4 + offset) % 16], src[(5 + offset) % 16], + src[(6 + offset) % 16], src[(7 + offset) % 16], + src[(8 + offset) % 16], src[(9 + offset) % 16], + src[(10 + offset) % 16], src[(11 + offset) % 16], + src[(12 + offset) % 16], src[(13 + offset) % 16], + src[(14 + 
offset) % 16], src[(15 + offset) % 16], + src[(0 + offset) % 16], src[(1 + offset) % 16]}, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { From 6dca41088b31f43f326aaf8052baeb67e915d159 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 15:41:58 +0000 Subject: [PATCH 32/71] Implemented UMPOA (8-bit to 32-bit widening uint) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 38 +++++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 32 +++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index b5496371fc..41fd194c31 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5902,6 +5902,44 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_UMOPA_MPPZZ_S: { // umopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 16716a5603..f3b83d510d 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -575,6 +575,38 @@ TEST_P(InstSme, st1w) { } } +TEST_P(InstSme, umopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({112}, (SVL / 16))); + } +} + TEST_P(InstSme, zero) { // ZT0 RUN_AARCH64(R"( From 8b1f9e711c3c1a218cc8411504318bcf516f8f3d Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 29 Oct 2024 16:05:02 +0000 Subject: [PATCH 33/71] Implemented LD1B (4 vec, imm offset) SVE2 instruction with tests. 
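The scalar-plus-immediate form resolves to four consecutive vector-sized blocks; destination register k receives block k, zero-masked by quarter k of the PNg counter predicate. A rough sketch of the layout, with VL_bits standing in for the configured vector length (illustrative only):

    addr = xn + imm * (VL_bits / 8)            (imm counts whole vectors of byte elements)
    zt1 <- bytes [addr,                  addr +     VL_bits/8)
    zt2 <- bytes [addr +     VL_bits/8,  addr + 2 * VL_bits/8)
    zt3 <- bytes [addr + 2 * VL_bits/8,  addr + 3 * VL_bits/8)
    zt4 <- bytes [addr + 3 * VL_bits/8,  addr + 4 * VL_bits/8)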
--- src/lib/arch/aarch64/Instruction_address.cc | 21 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 +++++ test/regression/aarch64/instructions/sve.cc | 104 +++++++++++++++++++- 3 files changed, 148 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index a0ed89fcd4..10c391e33d 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -353,6 +353,27 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 8; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 41fd194c31..dc9bce38c3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2758,6 +2758,30 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint8_t out[4][256] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 8; + + for (int r = 0; r < 4; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (preds[r][i / 64] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + results_[2] = {out[2], 256}; + results_[3] = {out[3], 256}; + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 00ae0efe56..48f67a2b1f 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -4938,7 +4938,6 @@ TEST_P(InstSve, ld1b) { std::vector src_multi = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12, 0x32, 0x54, 0x76, 0x98, 0x01, 0xEF, 0xCD, 0xAB}; - ; fillHeap(heap8_multi, src_multi, VL); // Two vector @@ -5019,6 +5018,109 @@ TEST_P(InstSve, ld1b) { src[(14 + offset) % 16], src[(15 + offset) % 16], src[(0 + offset) % 16], src[(1 + offset) % 16]}, VL / 8)); + + // Four vector + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + + ptrue pn8.b + + ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + )"); + base = (VL / 8) * 4; + offset = (VL / 8); + CHECK_NEON(0, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) 
% 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { From 5325d3f26ce0e12d30bec4170911dfe2dc727909 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 12:28:34 +0000 Subject: [PATCH 34/71] Implemented UDOT (4-way, VGx4 8-bit to 32-bit widening, indexed vector) SME instruction with tests. 
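The expected values in the new regression test can be reproduced by hand. Each umopa pre-fill accumulates four 8*3 products into every 32-bit ZA element, giving 96 everywhere; z10 holds the repeating byte pattern 0..15, so zm index 2 selects bytes {8, 9, 10, 11} from every 128-bit segment. With zn1 to zn4 broadcast to 10, 11, 12 and 13, each element of the single updated row in every ZA quarter becomes (a quick sanity check, not an exhaustive derivation):

    quarter 0 (zn1 = 10): 96 + 10*(8 + 9 + 10 + 11) = 96 + 10*38 = 476
    quarter 1 (zn2 = 11): 96 + 11*38 = 514
    quarter 2 (zn3 = 12): 96 + 12*38 = 552
    quarter 3 (zn4 = 13): 96 + 13*38 = 590

All other ZA rows are untouched and remain 96.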
--- src/lib/arch/aarch64/Instruction_execute.cc | 71 +++++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 61 ++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index dc9bce38c3..8a2dd74612 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5875,6 +5875,77 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_VG4_M4ZZI_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction destructively adds the widened dot product + // (4x 8-bit --> 1x 32-bit) of the following to each 32-bit element + // in the current `zaRow`: + // - four 8-bit values in each corresponding 32-bit element of + // the current source `znr` vector + // - four 8-bit values from a 32-bit element of `zm`, selected + // from each 128-bit segment of `zm` using an index + // + // The 128-bit segment of `zm` currently in use corresponds to the + // 128-bit segment that the current 32-bit elements of `znr` + // and `zaRow` are within. 
+ // For example, with a SVL = 512-bits, elements `e` of `zaRow` in + // the range 0->15, and zmIndex = 1: + // - When `e` = 0 -> 3, the 32-bit element used from `zm` will be + // zm[1] (1st 32-bit element in 0th 128-bit + // segment) + // - When `e` = 4 -> 7, the 32-bit element used from `zm` will be + // zm[5] (1st 32-bit element in 1st 128-bit + // segment) + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] results_[0] = sveUdot_indexed(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index f3b83d510d..0237a02840 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -575,6 +575,67 @@ TEST_P(InstSme, st1w) { } } +TEST_P(InstSme, udot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({476}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({514}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({552}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({590}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, umopa) { // 32-bit RUN_AARCH64(R"( From 7125a40066f94ed718cf94e058d54896ecceaa59 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 13:03:21 +0000 Subject: [PATCH 35/71] Implemented MOVA (array to vecs, 4 registers) SME instruction with tests. 
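The mova_zaToVecs test re-uses the umopa/udot sequence from the udot_vgx4 test, then reads the ZA single-vector group back with a .d element size, so each uint64_t lane packs two adjacent, equal 32-bit ZA elements (i.e. value * (2^32 + 1)). The expected constants follow directly:

    96  ->  96 * 2^32 +  96 = 0x0000006000000060 =  412316860512
    476 -> 476 * 2^32 + 476 = 0x000001DC000001DC = 2044404433372
    514 -> 514 * 2^32 + 514                      = 2207613190658
    552 -> 552 * 2^32 + 552                      = 2370821947944
    590 -> 590 * 2^32 + 590                      = 2534030705230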
--- src/lib/arch/aarch64/Instruction_execute.cc | 29 ++++++++--- test/regression/aarch64/instructions/sme.cc | 56 ++++++++++++++++++++- 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8a2dd74612..d95696c840 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3986,19 +3986,32 @@ void Instruction::execute() { const uint8_t offs4 = metadata_.operands[4].sme.slice_offset.imm_range.offset; - uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; - for (uint8_t i = offs1; i <= offs4; i++) { - // Get correct next row - const uint8_t* row = - sourceValues_[(ws + i) % sliceCount].getAsVector(); - // Update out and results_ const uint8_t index = i - offs1; - memcpy(out[index], row, sliceCount); - results_[index] = {out[index], 256}; + results_[index] = sourceValues_[(ws + i) % sliceCount]; } break; } + case Opcode::AArch64_MOVA_VG4_4ZMXI: { // mova {zd1.d - zd4.d}, za.d[wv, + // offs, vgx4] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[4].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + results_[2] = sourceValues_[(2 * zaStride) + zaIndex]; + results_[3] = sourceValues_[(3 * zaStride) + zaIndex]; + break; + } case Opcode::AArch64_MOVID: { // movi dd, #imm results_[0] = {static_cast(metadata_.operands[1].imm), 256}; break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 0237a02840..f7a3689e62 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,7 +7,7 @@ namespace { using InstSme = AArch64RegressionTest; -TEST_P(InstSme, mova) { +TEST_P(InstSme, mova_tileToVec) { // 8-bit RUN_AARCH64(R"( smstart @@ -47,6 +47,60 @@ TEST_P(InstSme, mova) { CHECK_NEON(7, float, fillNeonCombined({4}, {10}, SVL / 8)); } +TEST_P(InstSme, mova_zaToVecs) { + // 4 vectors + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + + mov w9, #0 + mova {z20.d - z23.d}, za.d[w9, #0, vgx4] + mov {z24.d - z27.d}, za.d[w8, #1, vgx4] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(22, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(23, uint64_t, 
fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); +} + TEST_P(InstSme, mova_tilesToVecs) { // uint8_t; 4 vectors initialHeapData_.resize(SVL / 4); From c6da568feae16035980f52d6f43816e08ccc8ade Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 13:35:23 +0000 Subject: [PATCH 36/71] Implemented ST1W (4 vec, imm offset) SVE2 instructions with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 27 +++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 ++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 32 ++++++++++++++++++++- 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 10c391e33d..bd78d6649c 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1304,6 +1304,33 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1W_4Z_IMM: { // st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 4, 4, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 4, 4, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 4, 4, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 4, 4, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] const uint64_t* p = sourceValues_[1].getAsVector(); const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d95696c840..1f471da2d2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5136,6 +5136,30 @@ void Instruction::execute() { memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); break; } + case Opcode::AArch64_ST1W_4Z_IMM: { // st1w {zt1.s - zt4.s}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint32_t* t1 = sourceValues_[0].getAsVector(); + const uint32_t* t2 = sourceValues_[1].getAsVector(); + const uint32_t* t3 = sourceValues_[2].getAsVector(); + const uint32_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + 
memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1i16: { // st1 {vt.h}[index], [xn] // STORE const uint16_t* t = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 48f67a2b1f..1d3925dcd8 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7192,7 +7192,6 @@ TEST_P(InstSve, st1w_multivec) { st1w {z0.s, z1.s}, pn8, [x4, #4, mul vl] st1w {z0.s, z1.s}, pn8, [x4, x1, lsl #2] )"); - for (uint64_t i = 0; i < (VL / 16); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 4)), @@ -7201,6 +7200,37 @@ TEST_P(InstSve, st1w_multivec) { src[i % 4]); EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 4)), src[i % 4]); } + + // Four vectors + initialHeapData_.resize(VL); + heap32 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap32, src, VL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.s + ptrue pn8.s + ld1w {z0.s}, p0/z, [x0] + ld1w {z1.s}, p0/z, [x0, #1, mul vl] + ld1w {z2.s}, p0/z, [x0, #2, mul vl] + ld1w {z3.s}, p0/z, [x0, #3, mul vl] + st1w {z0.s - z3.s}, pn8, [sp] + st1w {z0.s - z3.s}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 8); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 4)), + src[i % 4]); + } } TEST_P(InstSve, str_predicate) { From 7e2f9a462b72e51373f429c8166cca2e7a51485d Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 16:55:44 +0000 Subject: [PATCH 37/71] Fixed SVE udot execution logic. 
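sveUdot_indexed previously applied the immediate index across the whole zm vector; the indexed UDOT semantics instead select one D-type element from each 128-bit segment of zm and re-use it for every destination element that falls inside that segment. For the 8-bit to 32-bit form the corrected indexing is, roughly:

    base   = i - (i % 4)                 (4 x 32-bit elements per 128-bit segment)
    out[i] = zd[i] + sum over j = 0..3 of  zn_bytes[4*i + j] * zm_bytes[4*(base + index) + j]

The regression test now fills z0 with ld1rqb rather than ldr q0, replicating the 16-byte source pattern into every 128-bit segment so the expected values stay independent of the vector length.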
--- src/include/simeng/arch/aarch64/helpers/sve.hh | 6 +++++- test/regression/aarch64/instructions/sve.cc | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index df924c1f8c..c963b22f7a 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1640,9 +1640,13 @@ RegisterValue sveUdot_indexed( D out[256 / sizeof(D)] = {0}; for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { D acc = zd[i]; + // Index into zm selects which D-type element within each 128-bit vector + // segment to use + int base = i - (i % (128 / (sizeof(D) * 8))); + int zmIndex = base + index; for (int j = 0; j < W; j++) { acc += (static_cast(zn[(W * i) + j]) * - static_cast(zm[(W * index) + j])); + static_cast(zm[(W * zmIndex) + j])); } out[i] = acc; } diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 1d3925dcd8..bdf4320658 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7662,7 +7662,8 @@ TEST_P(InstSve, udot) { mov x8, #214 svc #0 - ldr q0, [x0] + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] dup z2.b, #2 dup z3.b, #3 @@ -7672,6 +7673,7 @@ TEST_P(InstSve, udot) { udot z4.s, z2.b, z0.b[0] udot z5.s, z3.b, z0.b[3] )"); + CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); } From 6772b66e4b57d434007d0b23f8b042d6f0988d95 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 30 Oct 2024 17:38:57 +0000 Subject: [PATCH 38/71] Fixed issue with LD1B SVE2 (4 vec) instruction. --- src/lib/arch/aarch64/Instruction_address.cc | 2 +- src/lib/arch/aarch64/Instruction_execute.cc | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index bd78d6649c..41b03e3216 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -359,7 +359,7 @@ span Instruction::generateAddresses() { const uint64_t base = sourceValues_[1].get(); const uint64_t offset = - static_cast(metadata_.operands[3].mem.disp); + static_cast(metadata_.operands[5].mem.disp); const uint64_t addr = base + (offset * partition_num); std::vector addresses; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 1f471da2d2..efe8a208c6 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2765,14 +2765,15 @@ void Instruction::execute() { auto preds = predAsCounterToMasks(pn, VL_bits); - uint8_t out[4][256] = {{0}, {0}}; + uint8_t out[4][256] = {{0}, {0}, {0}, {0}}; const uint16_t partition_num = VL_bits / 8; for (int r = 0; r < 4; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << (i % 64); if (preds[r][i / 64] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } From ab80ba7c514b428f9ba365af2d3b4dd447bc1959 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 31 Oct 2024 15:53:52 +0000 Subject: [PATCH 39/71] Implemented FMLA (float, double, VGx4, indexed) SME instruction with tests. 
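The regression test values can be checked by hand: fmopa with both predicates true fills every ZA element with 3.0 * 8.0 = 24.0, and z10 holds the repeating float pattern {0, 1, 2, 3}, so zm index 2 picks 2.0 out of every 128-bit segment. Each element of the four updated rows is then a single multiply-add (sketch):

    quarter 0 (zn1 =  0.25): 24.0 + ( 0.25 * 2.0) = 24.5
    quarter 1 (zn2 =  1.5 ): 24.0 + ( 1.5  * 2.0) = 27.0
    quarter 2 (zn3 = -0.5 ): 24.0 + (-0.5  * 2.0) = 23.0
    quarter 3 (zn4 = -2.5 ): 24.0 + (-2.5  * 2.0) = 19.0

The double-precision variant loads the pattern {2.0, 3.0} and uses index 0, which selects the same 2.0 multiplier, so the expected rows are identical.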
--- src/lib/arch/aarch64/Instruction_execute.cc | 106 +++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 122 +++++++++++++++++++- 2 files changed, 227 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index efe8a208c6..3ee9dfeec1 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1824,6 +1824,112 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4ZZI_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, zm.d[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const double* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. 
+ + // MOD 2 as there are 2 64-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 2); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_FMLA_VG4_M4ZZI_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, zm.s[index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const float* zm = sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + // This instruction multiplies each element of the current `znr` by + // an indexed element of `zm` and destructively adds the result to + // the corresponding element in the current `zaRow`. + // + // The index for `zm` specifies which element in each 128-bit + // segment to use. The 128-bit segment of `zm` currently in use + // corresponds to the 128-bit segment that the current element of + // `znr` and `zaRow` is within. 
+ + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + out[e] = zaRow[e] + (znr[e] * zm[zmSegBase + zmIndex]); + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FMLA_ZPmZZ_D: { // fmla zd.d, pg/m, zn.d, zm.d results_[0] = sveMlaPredicated_vecs(sourceValues_, VL_bits); break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index f7a3689e62..25e2b4c800 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -142,6 +142,126 @@ TEST_P(InstSme, mova_tilesToVecs) { CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); } +TEST_P(InstSme, fmla_indexed_vgx4) { + // float + initialHeapData_.resize(SVL); + float* heapf = reinterpret_cast(initialHeapData_.data()); + std::vector srcf = {0.0f, 1.0f, 2.0f, 3.0f}; + fillHeap(heapf, srcf, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #0.25 + fdup z5.s, #1.5 + fdup z6.s, #-0.5 + fdup z7.s, #-2.5 + ld1w {z10.s}, p0/z, [x0] + + fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, z10.s[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.5f}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({23.0f}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({19.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } + + // double + initialHeapData_.resize(SVL); + double* heapd = reinterpret_cast(initialHeapData_.data()); + std::vector srcd = {2.0f, 3.0f}; + fillHeap(heapd, srcd, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + ld1d {z10.d}, p0/z, [x0] + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, z10.d[0] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.5}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({27.0}, (SVL / 8))); + } else if 
(i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({23.0}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({19.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } +} + TEST_P(InstSme, fmopa) { // 32-bit RUN_AARCH64(R"( @@ -629,7 +749,7 @@ TEST_P(InstSme, st1w) { } } -TEST_P(InstSme, udot_vgx4) { +TEST_P(InstSme, udot_Indexed_vgx4) { // 8-bit to 32-bit widening initialHeapData_.resize(SVL / 8); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 9e762b88f0c10c03f6011cd4591338f1c369b80b Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 31 Oct 2024 16:43:55 +0000 Subject: [PATCH 40/71] Implemented st1d (4 consec vecs, uint64, imm offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 27 ++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 24 ++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 31 +++++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 41b03e3216..ad61720c6a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1154,6 +1154,33 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + const uint64_t pn = sourceValues_[4].get(); + auto preds = predAsCounterToMasks(pn, VL_bits); + const uint16_t partition_num = VL_bits / 64; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); + + std::vector addresses; + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 8, 8, + preds[0].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + (VL_bits / 8), + partition_num, 8, 8, + preds[1].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 2 * (VL_bits / 8), + partition_num, 8, 8, + preds[2].data(), addresses); + generatePredicatedContiguousAddressBlocks(addr + 3 * (VL_bits / 8), + partition_num, 8, 8, + preds[3].data(), addresses); + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST2D_IMM: { // st2d {zt1.d, zt2.d}, pg, [{, // #imm, mul vl}] const uint64_t* p = sourceValues_[2].getAsVector(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 3ee9dfeec1..8d609ba38c 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5019,6 +5019,30 @@ void Instruction::execute() { memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); break; } + case Opcode::AArch64_ST1D_4Z_IMM: { // st1d {zt1.d - zt4.d}, png, [xn{, + // #imm, mul vl}] + // STORE + const uint64_t* t1 = sourceValues_[0].getAsVector(); + const uint64_t* t2 = sourceValues_[1].getAsVector(); + const uint64_t* t3 = sourceValues_[2].getAsVector(); + const uint64_t* t4 = sourceValues_[3].getAsVector(); + const uint64_t pn = sourceValues_[4].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + memoryData_ = + sve_merge_store_data(t1, preds[0].data(), VL_bits); + std::vector out2 = + sve_merge_store_data(t2, preds[1].data(), 
VL_bits); + std::vector out3 = + sve_merge_store_data(t3, preds[2].data(), VL_bits); + std::vector out4 = + sve_merge_store_data(t4, preds[3].data(), VL_bits); + memoryData_.insert(memoryData_.end(), out2.begin(), out2.end()); + memoryData_.insert(memoryData_.end(), out3.begin(), out3.end()); + memoryData_.insert(memoryData_.end(), out4.begin(), out4.end()); + break; + } case Opcode::AArch64_ST1Fourv16b: { // st1 {vt.16b, vt2.16b, vt3.16b, // vt4.16b}, [xn|sp] // STORE diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index bdf4320658..0e8344b3a0 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -6948,6 +6948,37 @@ TEST_P(InstSve, st1d_multivec) { src[i % 4]); EXPECT_EQ(getMemoryValue(65792 + 8 + (i * 8)), src[i % 4]); } + + // Four vectors + initialHeapData_.resize(VL); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, VL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + sub sp, sp, #4095 + mov x1, #2 + mov x4, #256 + madd x4, x4, x4, x4 + ptrue p0.d + ptrue pn8.d + ld1d {z0.d}, p0/z, [x0] + ld1d {z1.d}, p0/z, [x0, #1, mul vl] + ld1d {z2.d}, p0/z, [x0, #2, mul vl] + ld1d {z3.d}, p0/z, [x0, #3, mul vl] + st1d {z0.d - z3.d}, pn8, [sp] + st1d {z0.d - z3.d}, pn8, [x4, #8, mul vl] + )"); + for (uint64_t i = 0; i < (VL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 4]); + EXPECT_EQ(getMemoryValue(65792 + (8 * (VL / 8)) + (i * 8)), + src[i % 4]); + } } TEST_P(InstSve, st2d) { From 7de00825c4e79a4e595dfae37a9e0e5d3eafeb12 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 31 Oct 2024 17:44:36 +0000 Subject: [PATCH 41/71] Added NEON bf16 UDOT (by element) instruction execution logic and BF16 build option. --- CMakeLists.txt | 1 + src/include/simeng/version.hh.in | 1 + src/lib/arch/aarch64/Instruction_execute.cc | 23 +++++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d0691578fe..42111288ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ option(SIMENG_SANITIZE "Enable compiler sanitizers" OFF) option(SIMENG_OPTIMIZE "Enable Extra Compiler Optimizations" OFF) option(SIMENG_ENABLE_SST "Compile SimEng SST Wrapper" OFF) option(SIMENG_ENABLE_SST_TESTS "Enable testing for SST" OFF) +option(SIMENG_ENABLE_BF16 "Enable __bf16 instruction execution logic" OFF) # Set CXX flag for Apple Mac so that `binary_function` and `unary_function` types that are used in SST can be recognised. 
# They were deprecated in C++11 and removed in C++17, and Apple Clang v15 no longer supports these types without the following flag diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in index 5f1e8f410b..8a2a823a66 100644 --- a/src/include/simeng/version.hh.in +++ b/src/include/simeng/version.hh.in @@ -9,5 +9,6 @@ #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@ #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}" #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}" +#define SIMENG_ENABLE_BF16 "${SIMENG_ENABLE_BF16}" #endif \ No newline at end of file diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 8d609ba38c..583818fc39 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -486,6 +486,29 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } + case Opcode::AArch64_BF16DOTlanev8bf16: { // bfdot vd.4s, vn.8h, + // vm.2h[index] + // BF16 -- EXPERIMENTAL + if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const float* vd = sourceValues_[0].getAsVector(); + const __bf16* vn = sourceValues_[1].getAsVector<__bf16>(); + const __bf16* vm = sourceValues_[2].getAsVector<__bf16>(); + const int vmIndex = metadata_.operands[2].vector_index; + + float out[4] = {vd[0], vd[1], vd[2], vd[3]}; + for (int i = 0; i < 4; i++) { + out[i] += (static_cast(vn[2 * i]) * + static_cast(vm[2 * vmIndex])) + + (static_cast(vn[2 * i + 1]) * + static_cast(vm[2 * vmIndex + 1])); + } + results_[0] = RegisterValue(out, 256); + break; + } case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; From 14a79d8192036439ccacccd2e039af57b829eecc Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 14:33:56 +0000 Subject: [PATCH 42/71] Implemented ld1b (4 strided vectors, imm and reg offset) instructions with tests. 
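The strided forms share their memory layout with the consecutive-register LD1B (4 vec) variants: four vector-sized blocks starting at xn + xm (register offset) or at xn plus #imm whole vectors (immediate offset). The execute logic therefore falls through to the existing LD1B_4Z_IMM handler; only the destination register numbering differs, using a register stride of four (e.g. {z16.b, z20.b, z24.b, z28.b} in the tests). A rough sketch of which block lands where (illustrative only):

    addr(register form)  = xn + xm
    addr(immediate form) = xn + imm * (VL_bits / 8)
    block k = bytes [addr + k*(VL_bits/8), addr + (k+1)*(VL_bits/8))   for k = 0..3
    block k -> k-th register of the strided group, masked by quarter k of the PNg counter predicate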
--- src/lib/arch/aarch64/Instruction_address.cc | 22 +++ src/lib/arch/aarch64/Instruction_execute.cc | 9 + test/regression/aarch64/instructions/sve.cc | 174 ++++++++++++++++++++ 3 files changed, 205 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index ad61720c6a..d9180f189a 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -353,6 +353,28 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1B_4Z_STRIDED: { // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + [[fallthrough]]; case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 583818fc39..a433e409c8 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2887,6 +2887,15 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1B_4Z_STRIDED: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z_STRIDED_IMM: // ld1b {zt1.b, zt2.b, zt3.b, + // zt4.b}, png/z, [xn{, #imm, + // mul vl}] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 0e8344b3a0..60099cbb5d 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5033,10 +5033,14 @@ TEST_P(InstSve, ld1b) { ptrue pn8.b + mov x1, #4 ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + ld1b {z16.b, z20.b, z24.b, z28.b}, pn8/z, [x0, #4, mul vl] + ld1b {z17.b, z21.b, z25.b, z29.b}, pn8/z, [x0, x1] )"); base = (VL / 8) * 4; offset = (VL / 8); + // Consecutive vectors CHECK_NEON(0, uint8_t, fillNeon( { @@ -5121,6 +5125,176 @@ TEST_P(InstSve, ld1b) { src[((base + (3 * offset)) + 15) % 16], }, VL / 8)); + // Strided (4-stride) vectors + CHECK_NEON(16, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(20, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + 
src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(24, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(28, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); + base = (VL / 8) + 4; + CHECK_NEON(17, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(21, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(25, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + 
src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(29, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); } TEST_P(InstSve, ld1sw_gather) { From 2db03bcfdef99a322f9794a666b780865f45f655 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 15:06:17 +0000 Subject: [PATCH 43/71] Implemented UVDOT (VGx4 8-bit to 32-bit widening, indexed vector) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 52 ++++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 61 +++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a433e409c8..c48886aa93 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6146,6 +6146,58 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_UVDOT_VG4_M4ZZI_BtoS: { // uvdot za.s[wv, #off, + // vgx4], {zn1.b - zn4.b}, + // zm.b[#index] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + // Get zm vector and zm's index + const uint8_t* zm = + sourceValues_[zaRowCount + 5].getAsVector(); + const int zmIndex = metadata_.operands[5].vector_index; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+ const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // MOD 4 as there are 4 32-bit elements per 128-bit segment of `zm` + const int zmSegBase = e - (e % 4); + const int s = zmSegBase + zmIndex; + // There are 4 8-bit elements per 32-bit element of `znr` and `zm` + for (int i = 0; i < 4; i++) { + const uint8_t* znr = + sourceValues_[zaRowCount + 1 + i].getAsVector(); + out[e] += static_cast(znr[4 * e + r]) * + static_cast(zm[4 * s + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] results_[0] = sveUdot_indexed(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 25e2b4c800..4ea8c58a74 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -810,6 +810,67 @@ TEST_P(InstSme, udot_Indexed_vgx4) { } } +TEST_P(InstSme, uvdot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + + uvdot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm values of {8, 9, 10, 11} + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({538}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, umopa) { // 32-bit RUN_AARCH64(R"( From 68038b713a8f393cd4ef1570129f241cf2732da5 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 18:11:03 +0000 Subject: [PATCH 44/71] Implemented ST4W (imm offset) SVE instruction with tests. 
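ST4W stores its four source vectors element-interleaved rather than back to back. A minimal standalone sketch of the memory layout the new execute logic produces (the helper name and use of std::vector are illustrative only, not part of the patch):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // For each active lane k, ST4W writes zt1[k], zt2[k], zt3[k], zt4[k] to
    // four consecutive 32-bit words before moving on to lane k + 1.
    std::vector<uint32_t> st4wLayout(const std::vector<uint32_t>& zt1,
                                     const std::vector<uint32_t>& zt2,
                                     const std::vector<uint32_t>& zt3,
                                     const std::vector<uint32_t>& zt4) {
      std::vector<uint32_t> mem;
      for (std::size_t k = 0; k < zt1.size(); k++) {
        mem.push_back(zt1[k]);
        mem.push_back(zt2[k]);
        mem.push_back(zt3[k]);
        mem.push_back(zt4[k]);
      }
      return mem;
    }

With the regression test's inputs (z0-z3 duplicated to 3, 4, 5 and 6), memory should therefore read 3, 4, 5, 6, 3, 4, 5, 6, ... which is what the EXPECT_EQ checks assert.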
--- src/lib/arch/aarch64/Instruction_address.cc | 20 +++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 40 ++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 47 ++++++++++++++++++++- 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index d9180f189a..88e3c1bde4 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1222,6 +1222,26 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = + static_cast(metadata_.operands[5].mem.disp); + + std::vector addresses; + addresses.reserve(partition_num * 4); + + uint64_t addr = base + (offset * partition_num * 4); + + generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 4, p, + addresses); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, // [{, xm, lsl #3}] case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index c48886aa93..4b90ef5d55 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5448,6 +5448,46 @@ void Instruction::execute() { results_[0] = sourceValues_[2].get() + postIndex; break; } + case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [{, #imm, mul vl}] + // STORE + const uint32_t* d1 = sourceValues_[0].getAsVector(); + const uint32_t* d2 = sourceValues_[1].getAsVector(); + const uint32_t* d3 = sourceValues_[2].getAsVector(); + const uint32_t* d4 = sourceValues_[3].getAsVector(); + const uint64_t* p = sourceValues_[4].getAsVector(); + + std::vector memData; + bool inActiveBlock = false; + + const uint16_t partition_num = VL_bits / 32; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + // If active and not in active block, initialise + if (!inActiveBlock) { + memData.clear(); + inActiveBlock = true; + } + memData.push_back(d1[i]); + memData.push_back(d2[i]); + memData.push_back(d3[i]); + memData.push_back(d4[i]); + } else if (inActiveBlock) { + inActiveBlock = false; + memoryData_[index] = RegisterValue( + (char*)memData.data(), sizeof(uint32_t) * memData.size()); + index++; + } + } + // Add final block if needed + if (inActiveBlock) + memoryData_[index] = RegisterValue((char*)memData.data(), + sizeof(uint32_t) * memData.size()); + + break; + } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] // STORE memoryData_[0] = sourceValues_[0]; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 60099cbb5d..18b27a5708 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7156,7 +7156,6 @@ TEST_P(InstSve, st1d_multivec) { } TEST_P(InstSve, st2d) { - // 32-bit RUN_AARCH64(R"( ptrue p0.d mov x0, #0 @@ -7193,6 +7192,52 @@ TEST_P(InstSve, st2d) { } } +TEST_P(InstSve, st4w) { + // 32-bit + RUN_AARCH64(R"( + ptrue p0.s + mov x0, #0 + addvl x1, x0, #1 + mov x2, 
#8 + udiv x3, x1, x2 + whilelo p1.s, xzr, x3 + + sub sp, sp, #4095 + mov x6, #300 + + dup z0.s, #3 + dup z1.s, #4 + dup z2.s, #5 + dup z3.s, #6 + + st4w {z0.s - z3.s}, p0, [sp] + st4w {z0.s - z3.s}, p1, [x6, #4, mul vl] + )"); + + for (uint64_t i = 0; i < (VL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4)), + 3); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 4), + 4); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 8), + 5); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (4 * i * 4) + 12), + 6); + } + + int index = 4 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } +} + TEST_P(InstSve, st1w_scatter) { // 32-bit RUN_AARCH64(R"( From 4a8f3f64397342088b9c28ee812060a89328b56c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 18:23:02 +0000 Subject: [PATCH 45/71] Implemented LD1W (4 vec, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 18 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 24 +++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 88e3c1bde4..07c2e84709 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -519,6 +519,24 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1W_4Z: { // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 2); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 32; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4b90ef5d55..d36486154b 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3465,6 +3465,10 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1W_4Z: // ld1w {zt1.s - zt4.s}, png/z, [xn, + // xm, lsl #2] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1W_4Z_IMM: { // ld1w {zt1.s - zt4.s}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 18b27a5708..a0df6713ea 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5684,8 +5684,10 @@ TEST_P(InstSve, ld1w) { dup z3.s, #4 ptrue pn8.s + addvl x1, x1, #1 ld1w {z0.s - z3.s}, pn8/z, [x0, #4, mul vl] + ld1w {z4.s - z7.s}, pn8/z, [x0, x1, lsl #2] )"); 
base = (VL / 32) * 4; offset = (VL / 32); @@ -5711,6 +5713,28 @@ TEST_P(InstSve, ld1w) { src[((base + (offset * 3)) + 2) % 4], src[((base + (offset * 3)) + 3) % 4]}, VL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint32_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({src[((base + (offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld2d) { From 3d5b288f08e2f74341d1b3586ff99ef8daf923cb Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 1 Nov 2024 19:09:34 +0000 Subject: [PATCH 46/71] Implemented FMLA (float, VGx4) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 42 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 59 +++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d36486154b..131b327fae 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1847,6 +1847,48 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_S: { // fmla za.s[wv, offs, vgx4], + // {zn1.s - zn4.s}, {zm1.s - + // zm4.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+          const float* zaRow =
+              sourceValues_[(r * zaStride) + zaIndex].getAsVector<float>();
+          const float* zn = sourceValues_[n + r].getAsVector<float>();
+          const float* zm = sourceValues_[m + r].getAsVector<float>();
+          float out[64] = {0.0f};
+          for (int e = 0; e < elemCount; e++) {
+            out[e] = zaRow[e] + (zn[e] * zm[e]);
+          }
+          results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256);
+        }
+        break;
+      }
       case Opcode::AArch64_FMLA_VG4_M4ZZI_D: {  // fmla za.d[wv, offs, vgx4],
                                                 // {zn1.d - zn4.d}, zm.d[index]
         // SME
diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc
index 4ea8c58a74..9e9406ff51 100644
--- a/test/regression/aarch64/instructions/sme.cc
+++ b/test/regression/aarch64/instructions/sme.cc
@@ -142,6 +142,65 @@ TEST_P(InstSme, mova_tilesToVecs) {
   CHECK_NEON(11, uint8_t, fillNeon<uint8_t>({0x00}, SVL / 8));
 }

+TEST_P(InstSme, fmla_multiVecs) {
+  // float, vgx4
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    smstart
+
+    zero {za}
+
+    # Pre-fill all of za with 24.0f
+    fdup z1.s, #3.0
+    fdup z2.s, #8.0
+    ptrue p0.s
+    ptrue p1.s
+    fmopa za0.s, p0/m, p1/m, z1.s, z2.s
+    fmopa za1.s, p0/m, p1/m, z1.s, z2.s
+    fmopa za2.s, p0/m, p1/m, z1.s, z2.s
+    fmopa za3.s, p0/m, p1/m, z1.s, z2.s
+
+    # initialise registers
+    mov w8, #1
+    fdup z4.s, #0.25
+    fdup z5.s, #1.5
+    fdup z6.s, #-0.5
+    fdup z7.s, #-2.5
+    fdup z8.s, #3.0
+    fdup z9.s, #4.0
+    fdup z10.s, #5.0
+    fdup z11.s, #6.0
+
+    fmla za.s[w8, #1, vgx4], {z4.s - z7.s}, {z8.s - z11.s}
+  )");
+  const uint16_t zaStride = (SVL / 8) / 4;
+  const uint16_t zaQuartIndex = 2;
+  for (uint64_t i = 0; i < (SVL / 8); i++) {
+    // Affected rows each accumulate one {zn, zm} pair onto 24.0f:
+    // (0.25 * 3.0), (1.5 * 4.0), (-0.5 * 5.0) and (-2.5 * 6.0) respectively
+    if (i == zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({24.75f}, (SVL / 8)));
+    } else if (i == zaStride + zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({30.0f}, (SVL / 8)));
+    } else if (i == (2 * zaStride) + zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({21.5f}, (SVL / 8)));
+    } else if (i == (3 * zaStride) + zaQuartIndex) {
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({9.0f}, (SVL / 8)));
+    } else {
+      // unaffected rows should still be 24.0f throughout
+      CHECK_MAT_ROW(AARCH64_REG_ZA, i, float,
+                    fillNeon<float>({24.0f}, (SVL / 8)));
+    }
+  }
+}
+
 TEST_P(InstSme, fmla_indexed_vgx4) {
   // float
   initialHeapData_.resize(SVL);

From b9dcabeac801775439707660080d6b1d094cdf04 Mon Sep 17 00:00:00 2001
From: Finn Wilkinson
Date: Mon, 4 Nov 2024 11:56:51 +0000
Subject: [PATCH 47/71] Implemented MOVA (array to vecs, 2 registers) SME instruction with tests.
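For the two-register (vgx2) form, both destination vectors take the same slice index, one from each half of ZA. A rough scalar model of the row selection implemented below (the function name is illustrative; zaRowCount is the number of ZA rows, i.e. SVL in bytes):

    #include <array>
    #include <cstdint>

    // Returns the two ZA row numbers whose contents are copied to {zd1, zd2}.
    std::array<uint32_t, 2> movaVg2Rows(uint32_t wv, uint32_t offs,
                                        uint32_t zaRowCount) {
      const uint32_t zaStride = zaRowCount / 2;         // rows per ZA half
      const uint32_t zaIndex = (wv + offs) % zaStride;  // slice within a half
      return {zaIndex, zaStride + zaIndex};
    }

For example, assuming SVL = 512 bits (64 ZA rows, so zaStride = 32), the test's w8 = 1 with offset #1 selects rows 2 and 34.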
--- src/lib/arch/aarch64/Instruction_execute.cc | 18 +++++++ test/regression/aarch64/instructions/sme.cc | 58 ++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 131b327fae..072555cc11 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4177,6 +4177,24 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_MOVA_VG2_2ZMXI: { // mova {zd1.d, zd2.d}, za.d[wv, + // offs, vgx2] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + // Get ZA stride between halves and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[2].sme.slice_offset.imm) % + zaStride; + + results_[0] = sourceValues_[zaIndex]; + results_[1] = sourceValues_[zaStride + zaIndex]; + break; + } case Opcode::AArch64_MOVA_VG4_4ZMXI: { // mova {zd1.d - zd4.d}, za.d[wv, // offs, vgx4] // SME diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 9e9406ff51..066970b9ea 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -48,7 +48,7 @@ TEST_P(InstSme, mova_tileToVec) { } TEST_P(InstSme, mova_zaToVecs) { - // 4 vectors + // 2 vectors initialHeapData_.resize(SVL / 8); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, @@ -83,6 +83,62 @@ TEST_P(InstSme, mova_zaToVecs) { ld1b {z10.b}, p0/z, [x0] udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + # Extravt un-updated values + mov w9, #0 + mova {z20.d, z21.d}, za.d[w9, #0, vgx2] + # Extract 0th and 2nd updated rows + mov {z24.d, z25.d}, za.d[w8, #1, vgx2] + # Extract 1st and 3rd updated rows (get new offset into each half) + addvl x10, x10, #1 + mov x20, #4 + udiv x10, x10, x20 + mov {z26.d, z27.d}, za.d[w10, #2, vgx2] + )"); + // Check extracted un-effected rows (two uint32_t values of 96 equal one + // uint64_t value of 412316860512) + CHECK_NEON(20, uint64_t, fillNeon({412316860512}, SVL / 8)); + CHECK_NEON(21, uint64_t, fillNeon({412316860512}, SVL / 8)); + // Check extracted effected rows (two uint32_t values concatonated into one + // uint64_t value) + CHECK_NEON(24, uint64_t, fillNeon({2044404433372}, SVL / 8)); + CHECK_NEON(25, uint64_t, fillNeon({2370821947944}, SVL / 8)); + CHECK_NEON(26, uint64_t, fillNeon({2207613190658}, SVL / 8)); + CHECK_NEON(27, uint64_t, fillNeon({2534030705230}, SVL / 8)); + + // 4 vectors + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # Set 4 of the za rows + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z10.b}, p0/z, [x0] + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, z10.b[2] + mov w9, #0 mova {z20.d - z23.d}, za.d[w9, #0, vgx4] mov {z24.d - 
z27.d}, za.d[w8, #1, vgx4] From b988e0117a095f0be1d5f8bcc7a42e14842dfa29 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 12:36:46 +0000 Subject: [PATCH 48/71] Implemented FADD (float, vgx2) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 37 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 52 +++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 072555cc11..9a9e4239fc 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1373,6 +1373,43 @@ void Instruction::execute() { results_[0] = {add_3ops(sourceValues_), 256}; break; } + case Opcode::AArch64_FADD_VG2_M2Z_S: { // fadd za.s[wv, #off, vgx2], + // {zn1.s, zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const float* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const float* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + float out[64] = {0.0f}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FADD_ZPmI_D: { // fadd zdn.d, pg/m, zdn.d, const results_[0] = sveAddPredicated_const(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 066970b9ea..4efd18849f 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -198,6 +198,58 @@ TEST_P(InstSme, mova_tilesToVecs) { CHECK_NEON(11, uint8_t, fillNeon({0x00}, SVL / 8)); } +TEST_P(InstSme, fadd) { + // Float, VGx2 + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0f + fdup z1.s, #3.0 + fdup z2.s, #8.0 + ptrue p0.s + ptrue p1.s + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + fmopa za1.s, p0/m, p1/m, z1.s, z2.s + fmopa za2.s, p0/m, p1/m, z1.s, z2.s + fmopa za3.s, p0/m, p1/m, z1.s, z2.s + + # initialise registers + mov w8, #1 + fdup z4.s, #-2.5 + fdup z5.s, #3.0 + + fadd za.s[w8, #1, vgx2], {z4.s, z5.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0f + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({21.5f}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + 
CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({27.0f}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, + fillNeon({24.0f}, (SVL / 8))); + } + } +} + TEST_P(InstSme, fmla_multiVecs) { // float, vgx4 RUN_AARCH64(R"( From 4f75ffe8466609b2b8ad267cfc8a7a41cb710b64 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 14:16:30 +0000 Subject: [PATCH 49/71] Implemented LD1D (4 vec, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 18 ++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 26 +++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 07c2e84709..4f52762bac 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -423,6 +423,24 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1D_4Z: { // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 3); + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] const uint16_t partition_num = VL_bits / 64; diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 9a9e4239fc..52d94c3b3a 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3041,6 +3041,10 @@ void Instruction::execute() { results_[1] = {out[1], 256}; break; } + case Opcode::AArch64_LD1D_4Z: // ld1d {zt1.d - zt4.d}, png/z, [xn, + // xm, lsl #3] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1D_4Z_IMM: { // ld1d {zt1.d - zt4.d}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index a0df6713ea..3acf783558 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5520,6 +5520,10 @@ TEST_P(InstSve, ld1d) { ptrue pn8.d ld1d {z0.d - z3.d}, pn8/z, [x0, #4, mul vl] + addvl x1, x1, #1 + mov x2, #2 + udiv x1, x1, x2 + ld1d {z4.d - z7.d}, pn8/z, [x0, x1, lsl #3] )"); base = (VL / 64) * 4; offset = (VL / 64); @@ -5545,6 +5549,28 @@ TEST_P(InstSve, ld1d) { src[((base + (offset * 3)) + 2) % 4], src[((base + (offset * 3)) + 3) % 4]}, VL / 8)); + CHECK_NEON(4, uint64_t, + fillNeon({src[(base) % 4], src[(base + 1) % 4], + src[(base + 2) % 4], src[(base + 3) % 4]}, + VL / 8)); + CHECK_NEON( + 5, uint64_t, + fillNeon( + {src[((base + offset)) % 4], src[((base + offset) + 1) % 4], + src[((base + offset) + 2) % 4], src[((base + offset) + 3) % 4]}, + VL / 8)); + CHECK_NEON(6, uint64_t, + fillNeon({src[((base + (offset * 2))) % 4], + src[((base + (offset * 2)) + 1) % 4], + src[((base + (offset * 2)) + 2) % 4], + src[((base + (offset * 2)) + 3) % 4]}, + VL / 8)); + CHECK_NEON(7, uint64_t, + fillNeon({src[((base + 
(offset * 3))) % 4], + src[((base + (offset * 3)) + 1) % 4], + src[((base + (offset * 3)) + 2) % 4], + src[((base + (offset * 3)) + 3) % 4]}, + VL / 8)); } TEST_P(InstSve, ld1h) { From f35472b02d6891bde114b75861d00ca297edf818 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 15:13:54 +0000 Subject: [PATCH 50/71] Implemented FMLA (double, VGx4) SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 42 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 59 +++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 52d94c3b3a..4d55b022e7 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1884,6 +1884,48 @@ void Instruction::execute() { [](double x, double y) -> double { return std::fmin(x, y); }); break; } + case Opcode::AArch64_FMLA_VG4_M4Z4Z_D: { // fmla za.d[wv, offs, vgx4], + // {zn1.d - zn4.d}, {zm1.d - + // zm4.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get sourceValues_ index of first zn and zm regs + const uint16_t n = zaRowCount + 1; + const uint16_t m = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. 
+ const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const double* zn = sourceValues_[n + r].getAsVector(); + const double* zm = sourceValues_[m + r].getAsVector(); + double out[32] = {0.0}; + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + (zn[e] * zm[e]); + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FMLA_VG4_M4Z4Z_S: { // fmla za.s[wv, offs, vgx4], // {zn1.s - zn4.s}, {zm1.s - // zm4.s} diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 4efd18849f..40c0b8ec99 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -307,6 +307,65 @@ TEST_P(InstSme, fmla_multiVecs) { fillNeon({24.0f}, (SVL / 8))); } } + + // double, vgx4 + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + # initialise registers + mov w8, #1 + fdup z4.d, #0.25 + fdup z5.d, #1.5 + fdup z6.d, #-0.5 + fdup z7.d, #-2.5 + fdup z8.d, #3.0 + fdup z9.d, #4.0 + fdup z10.d, #5.0 + fdup z11.d, #6.0 + + fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, {z8.d - z11.d} + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + // Effected rows all use same zm value of 2.0 + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.75}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({30.0}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({9.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } } TEST_P(InstSme, fmla_indexed_vgx4) { From 1bf3306deb071c6226fc4160cef9f5808400e337 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 15:19:09 +0000 Subject: [PATCH 51/71] Implemented FADD (double, vgx2) SME instruction with tests. 
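The expected values in the double-precision test follow directly from its setup: ZA is pre-filled by FMOPA with 3.0 * 8.0, and the two source vectors hold -2.5 and 3.0. A standalone check of that arithmetic (not part of the test suite):

    #include <cstdio>

    int main() {
      const double prefill = 3.0 * 8.0;    // every ZA element after the fmopa loop
      const double half0 = prefill - 2.5;  // selected slice in the first ZA half (+ z4.d)
      const double half1 = prefill + 3.0;  // selected slice in the second ZA half (+ z5.d)
      std::printf("%.1f %.1f\n", half0, half1);  // prints "21.5 27.0"
      return 0;
    }

These match the 21.5 and 27.0 rows asserted via CHECK_MAT_ROW, with every other row left at 24.0.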
--- src/lib/arch/aarch64/Instruction_execute.cc | 37 +++++++++++++++ test/regression/aarch64/instructions/sme.cc | 52 ++++++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4d55b022e7..459c999261 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -1373,6 +1373,43 @@ void Instruction::execute() { results_[0] = {add_3ops(sourceValues_), 256}; break; } + case Opcode::AArch64_FADD_VG2_M2Z_D: { // fadd za.d[wv, #off, vgx2], + // {zn1.d, zn2.d} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 64; + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // For each source vector and ZA Row pair + for (int r = 0; r < 2; r++) { + // Get row in correct ZA half + const double* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + // Get current source vector + const double* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + double out[32] = {0.0}; + // Loop over all elements and destructively add + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e] + znr[e]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_FADD_VG2_M2Z_S: { // fadd za.s[wv, #off, vgx2], // {zn1.s, zn2.s} // SME diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 40c0b8ec99..eb3ef04e4f 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -235,7 +235,6 @@ TEST_P(InstSme, fadd) { const uint16_t zaStride = (SVL / 8) / 2; const uint16_t zaHalfIndex = 2; for (uint64_t i = 0; i < (SVL / 8); i++) { - // Effected rows all use same zm value of 2.0f if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, fillNeon({21.5f}, (SVL / 8))); @@ -248,6 +247,57 @@ TEST_P(InstSme, fadd) { fillNeon({24.0f}, (SVL / 8))); } } + + // Double, VGx2 + initialHeapData_.resize(SVL / 8); + heap8 = reinterpret_cast(initialHeapData_.data()); + src = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 24.0 + fdup z1.d, #3.0 + fdup z2.d, #8.0 + ptrue p0.d + ptrue p1.d + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + fmopa za1.d, p0/m, p1/m, z1.d, z2.d + fmopa za2.d, p0/m, p1/m, z1.d, z2.d + fmopa za3.d, p0/m, p1/m, z1.d, z2.d + fmopa za4.d, p0/m, p1/m, z1.d, z2.d + fmopa za5.d, p0/m, p1/m, z1.d, z2.d + fmopa za6.d, p0/m, p1/m, z1.d, z2.d + fmopa za7.d, p0/m, p1/m, z1.d, z2.d + + + # initialise registers + mov w8, #1 + fdup z4.d, #-2.5 + fdup z5.d, #3.0 + + fadd za.d[w8, #1, vgx2], {z4.d, z5.d} + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({21.5}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, 
+ fillNeon({27.0}, (SVL / 8))); + } else { + // un-effected rows should still be 24.0f throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, + fillNeon({24.0}, (SVL / 8))); + } + } } TEST_P(InstSme, fmla_multiVecs) { From 4effde42896b63259b6f6526e7b8eb4c14bfca9e Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 15:34:28 +0000 Subject: [PATCH 52/71] Implemented LD1H (Single vec, imm offset) SVE instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 11 +++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 ++++ test/regression/aarch64/instructions/sve.cc | 13 +++++++++++++ 3 files changed, 28 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 4f52762bac..78956747f2 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -482,6 +482,17 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1H_IMM: { // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = metadata_.operands[2].mem.disp; + const uint64_t addr = base + (offset * partition_num * 2); + + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); + break; + } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 459c999261..cec2201ebb 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3168,6 +3168,10 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_IMM: // ld1h {zt.h}, pg/z, [xn{, #imm, mul + // vl}] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 3acf783558..f8a7a80606 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5591,6 +5591,7 @@ TEST_P(InstSve, ld1h) { ptrue p0.h # Load and broadcast values from heap ld1h {z0.h}, p0/z, [x0, x1, lsl #1] + ld1h {z2.h}, p0/z, [x0] # Test for inactive lanes mov x1, #0 @@ -5600,6 +5601,10 @@ TEST_P(InstSve, ld1h) { mov x2, #0 whilelo p1.h, xzr, x1 ld1h {z1.h}, p1/z, [x0, x2, lsl #1] + + addvl x10, x10, #1 + add x10, x10, x0 + ld1h {z3.h}, p1/z, [x10, #-1, mul vl] )"); CHECK_NEON(0, uint16_t, fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, @@ -5609,6 +5614,14 @@ TEST_P(InstSve, ld1h) { fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, {0}, VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, + 0x9876, 0xEF01, 0xABCD}, + {0}, VL / 8)); } TEST_P(InstSve, ld1w) { From 40bba12eb5b5a7fa1a752d9343a53d092f5a8f88 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 16:37:53 +0000 Subject: [PATCH 53/71] Added SVE bf16 DOT (indexed) instruction execution logic. 
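The execution logic widens each BF16 input to FP32 before multiplying. BF16 shares FP32's sign bit and 8-bit exponent, so a BF16 value is just the top 16 bits of the corresponding FP32 bit pattern, and widening amounts to copying it into the upper half-word of a zeroed float. A minimal sketch of that conversion (little-endian host assumed, matching the memcpy offsets used in the new case; the helper name is illustrative):

    #include <cstdint>
    #include <cstring>

    float bf16ToFloat(uint16_t bf) {
      float f = 0.0f;
      // Place the BF16 bits in the most significant 16 bits of the float.
      std::memcpy(reinterpret_cast<uint16_t*>(&f) + 1, &bf, sizeof(bf));
      return f;
    }
    // e.g. bf16ToFloat(0x4049) == 3.140625f (bit pattern 0x40490000).

Each 32-bit lane of the destination then accumulates two such widened products, as the BFDOT_ZZI case below does.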
--- src/lib/arch/aarch64/Instruction_execute.cc | 38 +++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index cec2201ebb..30d591f7b6 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -509,6 +509,44 @@ void Instruction::execute() { results_[0] = RegisterValue(out, 256); break; } + case Opcode::AArch64_BFDOT_ZZI: { // bfdot zd.s, zn.h, zm.h[index] + // BF16 -- EXPERIMENTAL + if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + const uint16_t partition_num = VL_bits / 16; + + const float* zd = sourceValues_[0].getAsVector(); + // Extract data as uint16_t so that bytes-per-element is correct + const uint16_t* zn = sourceValues_[1].getAsVector(); + const uint16_t* zm = sourceValues_[2].getAsVector(); + const int index = metadata_.operands[2].vector_index; + + float out[64] = {0.0f}; + for (int i = 0; i < partition_num; i++) { + // MOD 4 as 4 32-bit elements in each 128-bit segment + const int zmBase = i - (i % 4); + const int zmIndex = zmBase + index; + + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. + // Each bf16 is copied into the least significant 16-bits of each + // float variable. + // Need to re-interpret each float destination as a uint16_t* inside + // the memcpy so that the least-significant bits can be accessed. + memcpy((uint16_t*)&zn1 + 1, &zn[2 * i], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * i + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * zmIndex], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * zmIndex + 1], 2); + + out[i] = zd[i] + ((zn1 * zm1) + (zn2 * zm2)); + } + results_[0] = RegisterValue(out, 256); + break; + } case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; From 39323607103a56425b53bfd9f32e9233c168d6be Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 17:12:51 +0000 Subject: [PATCH 54/71] Implemented LD1H (two vec, imm and scalar offset) SVE instruction with tests. 
--- src/lib/arch/aarch64/Instruction_address.cc | 35 ++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 26 ++++++++++++ test/regression/aarch64/instructions/sve.cc | 46 +++++++++++++++++++++ 3 files changed, 107 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 78956747f2..703df9d849 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -493,6 +493,41 @@ span Instruction::generateAddresses() { setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } + case Opcode::AArch64_LD1H_2Z: { // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + (offset << 1); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + const uint16_t partition_num = VL_bits / 16; + + const uint64_t base = sourceValues_[1].get(); + const uint64_t offset = + static_cast(metadata_.operands[3].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); + + std::vector addresses; + addresses.reserve(2); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 30d591f7b6..bc56b6186e 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3229,6 +3229,32 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LD1H_2Z: // ld1h {zt1.h, zt2.h}, png/z, [xn, xm, + // lsl #1] + // LOAD + [[fallthrough]]; + case Opcode::AArch64_LD1H_2Z_IMM: { // ld1h {zt1.h, zt2.h}, png/z, [xn{, + // #imm, mul vl}] + // LOAD + const uint64_t pn = sourceValues_[0].get(); + + auto preds = predAsCounterToMasks(pn, VL_bits); + + uint16_t out[2][128] = {{0}, {0}}; + const uint16_t partition_num = VL_bits / 16; + + for (int r = 0; r < 2; r++) { + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (preds[r][i / 32] & shifted_active) { + out[r][i] = memoryData_[r].getAsVector()[i]; + } + } + } + results_[0] = {out[0], 256}; + results_[1] = {out[1], 256}; + break; + } case Opcode::AArch64_LD1Onev16b: { // ld1 {vt.16b} [xn] results_[0] = memoryData_[0].zeroExtend(memoryData_[0].size(), 256); break; diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f8a7a80606..c1f97fca4a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5574,6 +5574,7 @@ TEST_P(InstSve, ld1d) { } TEST_P(InstSve, ld1h) { + // Single vector initialHeapData_.resize(VL / 4); uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); fillHeap( @@ -5622,6 +5623,51 @@ TEST_P(InstSve, ld1h) { fillNeonCombined({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, {0}, VL / 
8)); + + // Multi vector + + // Two vector + initialHeapData_.resize(VL); + heap16 = reinterpret_cast(initialHeapData_.data()); + fillHeap( + heap16, {0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, 0xABCD}, + VL / 2); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + ptrue pn8.h + mov x1, #1 + ld1h {z0.h, z1.h}, pn8/z, [x0] + ld1h {z2.h, z3.h}, pn8/z, [x0, x1, lsl #1] + ld1h {z4.h, z5.h}, pn8/z, [x0, #2, mul vl] + )"); + CHECK_NEON(0, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, 0xEF01, + 0xABCD, 0xBEEF}, + VL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xBEEF, 0xDEAD, 0x5678, 0x1234, 0x5432, 0x9876, + 0xEF01, 0xABCD}, + VL / 8)); } TEST_P(InstSve, ld1w) { From 5aad523885719d9b4bd3264fbf47bddac1a7f110 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 17:33:19 +0000 Subject: [PATCH 55/71] Implemented BFMOPA (widening) SME instruction. --- src/lib/arch/aarch64/Instruction_execute.cc | 58 +++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index bc56b6186e..af49bcb743 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2219,6 +2219,64 @@ void Instruction::execute() { results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } + case Opcode::AArch64_BFMOPA_MPPZZ: { // bfmopa zada.s, pn/m, pm/m, zn.h, + // zm.h + // SME + // BF16 -- EXPERIMENTAL + if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); + // Must be enabled at SimEng compile time + // Not verified to be working for all compilers or OSs. + // No Tests written + + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint16_t to get 2-byte elements + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + const uint16_t* zm = + sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0.0f}; + // Shifted active is for bf16 elements + uint64_t shifted_active_row = 1ull << ((row % 32) * 2); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + outRow[col] = zadaRow[col]; + // Shifted active is for bf16 elements + uint64_t shifted_active_col = 1ull << ((col % 32) * 2); + bool pred_row1 = pn[(2 * row) / 32] & shifted_active_row; + bool pred_row2 = pn[(2 * row + 1) / 32] & shifted_active_row; + bool pred_col1 = pm[(2 * col) / 32] & shifted_active_col; + bool pred_col2 = pm[(2 * col + 1) / 32] & shifted_active_col; + if ((pred_row1 && pred_col1) || (pred_row2 && pred_col2)) { + float zn1, zn2, zm1, zm2; + // Horrible hack in order to convert bf16 (currently stored in a + // uint16_t) into a float. 
+ // Each bf16 is copied into the least significant 16-bits of each + // float variable. + // Need to re-interpret each float destination as a uint16_t* + // inside the memcpy so that the least-significant bits can be + // accessed. + memcpy((uint16_t*)&zn1 + 1, &zn[2 * row], 2); + memcpy((uint16_t*)&zn2 + 1, &zn[2 * row + 1], 2); + memcpy((uint16_t*)&zm1 + 1, &zm[2 * col], 2); + memcpy((uint16_t*)&zm2 + 1, &zm[2 * col + 1], 2); + outRow[col] += (pred_row1 && pred_col1) ? zn1 * zm1 : 0.0f; + outRow[col] += (pred_row2 && pred_col2) ? zn2 * zm2 : 0.0f; + } + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, // zm.d // SME From 430c775915055369156ed1c058af870411048e8b Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 18:01:57 +0000 Subject: [PATCH 56/71] Minor UMAXP fix. --- src/include/simeng/arch/aarch64/helpers/neon.hh | 4 ++-- test/regression/aarch64/instructions/neon.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2bf42e6fa..a10c8afd74 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -570,8 +570,8 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) { // Concatenate the vectors T temp[2 * I]; - memcpy(temp, m, sizeof(T) * I); - memcpy(temp + (sizeof(T) * I), n, sizeof(T) * I); + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); // Compare each adjacent pair of elements T out[I]; for (int i = 0; i < I; i++) { diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index ca9ae26a4e..1621cbbdad 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -2738,7 +2738,7 @@ TEST_P(InstNeon, umaxp) { ldr q0, [x0] ldr q1, [x0, #16] - umaxp v2.16b, v0.16b, v1.16b + umaxp v2.16b, v1.16b, v0.16b )"); CHECK_NEON(2, uint8_t, From a01c2fca5406a27765bbbc6617e393f1b2c97fde Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 4 Nov 2024 18:13:27 +0000 Subject: [PATCH 57/71] Fixed function comment. --- src/include/simeng/arch/aarch64/Instruction.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index bee47e01bc..f3854c84b4 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -286,7 +286,7 @@ enum class InsnType : uint32_t { /** Predefined shift values for converting pred-as-counter to pred-as-mask. */ const uint64_t predCountShiftVals[9] = {0, 1, 2, 0, 3, 0, 0, 0, 4}; -/** Convert Predicate-as-Mask to Predicate-as-Masks. +/** Convert Predicate-as-Counter to Predicate-as-Masks. * T represents the element type (i.e. for pg.s, T = uint32_t). * V represents the number of vectors the predicate-as-counter is being used * for. */ From 9790c6e8098cb1e168969a9e4d020ae3b2cceba5 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 5 Nov 2024 17:42:33 +0000 Subject: [PATCH 58/71] Updated BF16 comment. 
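The reworded comments describe the layout more precisely: BF16 is FP32 with the low 16 mantissa bits dropped, so the payload belongs in the most significant half of the float. A small illustration of the same relationship in the narrowing direction (truncation only; round-to-nearest would need extra handling):

    #include <cstdint>
    #include <cstring>

    uint16_t floatToBf16Truncate(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      // Keep the sign bit, the 8 exponent bits and the top 7 mantissa bits.
      return static_cast<uint16_t>(bits >> 16);
    }
    // floatToBf16Truncate(3.14159f) == 0x4049; widening it back gives 3.140625f.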
--- src/lib/arch/aarch64/Instruction_execute.cc | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index af49bcb743..a058c354fa 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -533,10 +533,9 @@ void Instruction::execute() { float zn1, zn2, zm1, zm2; // Horrible hack in order to convert bf16 (currently stored in a // uint16_t) into a float. - // Each bf16 is copied into the least significant 16-bits of each - // float variable. - // Need to re-interpret each float destination as a uint16_t* inside - // the memcpy so that the least-significant bits can be accessed. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. memcpy((uint16_t*)&zn1 + 1, &zn[2 * i], 2); memcpy((uint16_t*)&zn2 + 1, &zn[2 * i + 1], 2); memcpy((uint16_t*)&zm1 + 1, &zm[2 * zmIndex], 2); @@ -2260,11 +2259,9 @@ void Instruction::execute() { float zn1, zn2, zm1, zm2; // Horrible hack in order to convert bf16 (currently stored in a // uint16_t) into a float. - // Each bf16 is copied into the least significant 16-bits of each - // float variable. - // Need to re-interpret each float destination as a uint16_t* - // inside the memcpy so that the least-significant bits can be - // accessed. + // Each bf16 is copied into the most significant 16-bits of each + // float variable; given IEEE FP32 and BF16 have the same width + // exponent and one sign bit. memcpy((uint16_t*)&zn1 + 1, &zn[2 * row], 2); memcpy((uint16_t*)&zn2 + 1, &zn[2 * row + 1], 2); memcpy((uint16_t*)&zm1 + 1, &zm[2 * col], 2); From 5bc9330315777b37132c211877f88a39759cf8f0 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 12:45:32 +0000 Subject: [PATCH 59/71] Implemented NEON UDOT (by vector) instruction with tests. --- .../simeng/arch/aarch64/helpers/neon.hh | 27 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 4 +++ test/regression/aarch64/instructions/neon.cc | 24 +++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index a10c8afd74..52d0ef9011 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -951,6 +951,33 @@ RegisterValue vecUzp(srcValContainer& sourceValues, bool isUzp1) { return {out, 256}; } +/** Helper function for NEON instructions with the format `udot vd.s, vn.b, + * vm.b`. D represents the number of elements in the output vector to be updated + * (i.e. for vd.2s D = 2). Only 2 or 4 are valid. Returns correctly formatted + * RegisterValue. 
*/ +template +RegisterValue vecUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata) { + // Check D and N are valid values + static_assert((D == 2 || D == 4) && + "D must be either 2 or 4 to align with vd.2s or vd.4s."); + + const uint32_t* vd = sourceValues[0].getAsVector(); + const uint8_t* vn = sourceValues[1].getAsVector(); + const uint8_t* vm = sourceValues[2].getAsVector(); + + uint32_t out[D] = {0}; + for (int i = 0; i < D; i++) { + out[i] = vd[i]; + for (int j = 0; j < 4; j++) { + out[i] += (static_cast(vn[(4 * i) + j]) * + static_cast(vm[(4 * i) + j])); + } + } + return {out, 256}; +} + /** Helper function for NEON instructions with the format `udot vd.s, vn.b, * vm.4b[index]`. * D represents the number of elements in the output vector to be updated (i.e. diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index a058c354fa..505520287c 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6550,6 +6550,10 @@ void Instruction::execute() { metadata_, VL_bits); break; } + case Opcode::AArch64_UDOTv16i8: { // udot vd.4s, vn.16b, vm.16b + results_[0] = vecUdot<4>(sourceValues_, metadata_); + break; + } case Opcode::AArch64_UDOTlanev16i8: { // udot vd.4s, vn.16b, vm.4b[index] results_[0] = vecUdot_byElement<4>(sourceValues_, metadata_); break; diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index 1621cbbdad..6271023ea4 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -3681,6 +3681,30 @@ TEST_P(InstNeon, udot) { CHECK_NEON(3, uint32_t, {0xd328, 0x288e8, 0x27e25, 0x2b87f}); CHECK_NEON(4, uint32_t, {0xc333, 0x2731b, 0x0, 0x0}); CHECK_NEON(5, uint32_t, {0x1fe2, 0x8e62, 0xad7e, 0xb52f}); + + // udot by vector + initialHeapData_.resize(128); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFFFFFF; + heap64[1] = 0x01234567ABBACAFE; + heap64[2] = 0xFEDCBA98FFFFFFFF; + heap64[3] = 0xDEADCAFEABBABEEF; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ldr q0, [x0] + ldr q1, [x0, #16] + + movi v2.4s, #3 + + udot v2.4s, v1.16b, v0.16b + )"); + CHECK_NEON(0, uint64_t, {0xDEADBEEFFFFFFFFF, 0x01234567ABBACAFE}); + CHECK_NEON(1, uint64_t, {0xFEDCBA98FFFFFFFF, 0xDEADCAFEABBABEEF}); + CHECK_NEON(2, uint32_t, {0x3F807, 0x288E7, 0x27C6E, 0xB52C}); } TEST_P(InstNeon, uzp) { From 1fd130cde40b882e8a02dbf0217ba57ddd904be5 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 18:20:14 +0000 Subject: [PATCH 60/71] Implemented SVE UDOT (by vector, 4-way) instruction with tests. --- .../simeng/arch/aarch64/helpers/sve.hh | 28 +++++++++++++++++++ src/lib/arch/aarch64/Instruction_execute.cc | 5 ++++ test/regression/aarch64/instructions/sve.cc | 22 ++++++++++++++- 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index c963b22f7a..50eb19c657 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1618,6 +1618,34 @@ RegisterValue sveTrn2_3vecs(srcValContainer& sourceValues, return {out, 256}; } +/** Helper function for SVE instructions with the format `udot zd, zn, zm`. + * D represents the element type of the destination register (i.e. for zd.s, + * D = uint32_t). 
+ * N represents the element type of the source registers (i.e. for zn.b, N = + * uint8_t). + * W represents how many source elements are multiplied to form an output + * element (i.e. for 4-way, W = 4). + * Returns correctly formatted RegisterValue. */ +template +RegisterValue sveUdot( + srcValContainer& sourceValues, + const simeng::arch::aarch64::InstructionMetadata& metadata, + const uint16_t VL_bits) { + const D* zd = sourceValues[0].getAsVector(); + const N* zn = sourceValues[1].getAsVector(); + const N* zm = sourceValues[2].getAsVector(); + + D out[256 / sizeof(D)] = {0}; + for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + out[i] = zd[i]; + for (int j = 0; j < W; j++) { + out[i] += + (static_cast(zn[(W * i) + j]) * static_cast(zm[(W * i) + j])); + } + } + return {out, 256}; +} + /** Helper function for SVE instructions with the format `udot zd, zn, * zm[index]`. * D represents the element type of the destination register (i.e. for uint32_t, diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 505520287c..aa44d69079 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6545,6 +6545,11 @@ void Instruction::execute() { } break; } + case Opcode::AArch64_UDOT_ZZZ_S: { // udot zd.s, zn.b, zm.b + results_[0] = + sveUdot(sourceValues_, metadata_, VL_bits); + break; + } case Opcode::AArch64_UDOT_ZZZI_S: { // udot zd.s, zn.b, zm.b[index] results_[0] = sveUdot_indexed(sourceValues_, metadata_, VL_bits); diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index c1f97fca4a..43382697e6 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -8032,9 +8032,29 @@ TEST_P(InstSve, udot) { udot z4.s, z2.b, z0.b[0] udot z5.s, z3.b, z0.b[3] )"); - CHECK_NEON(4, uint32_t, fillNeon({1534}, VL / 8)); CHECK_NEON(5, uint32_t, fillNeon({629}, VL / 8)); + + // udot by vector - 4-way + initialHeapData_.resize(16); + heap64 = reinterpret_cast(initialHeapData_.data()); + heap64[0] = 0xDEADBEEFFFFF00FF; + heap64[1] = 0x01234567ABBACAFE; + RUN_AARCH64(R"( + # Get heap address + mov x0, #0 + mov x8, #214 + svc #0 + + ptrue p0.b + ld1rqb { z0.b }, p0/z, [x0] + + dup z2.b, #2 + dup z4.s, #4 + + udot z4.s, z2.b, z0.b + )"); + CHECK_NEON(4, uint32_t, fillNeon({1534, 1652, 1630, 420}, VL / 8)); } TEST_P(InstSve, uqdec) { From 81ddba7e9737ccf9f38ed0567449b064d8b8090a Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 20:44:12 +0000 Subject: [PATCH 61/71] Implemented SVE ST4W (scalar offset) instruction with tests, and changed address generation logic for ST2W and ST4W. --- .../simeng/arch/aarch64/operandContainer.hh | 2 +- src/lib/arch/aarch64/Instruction_address.cc | 68 +++++++++++++++++-- src/lib/arch/aarch64/Instruction_execute.cc | 51 +++----------- test/regression/aarch64/instructions/sve.cc | 10 +++ 4 files changed, 82 insertions(+), 49 deletions(-) diff --git a/src/include/simeng/arch/aarch64/operandContainer.hh b/src/include/simeng/arch/aarch64/operandContainer.hh index c73b8881da..996454b007 100644 --- a/src/include/simeng/arch/aarch64/operandContainer.hh +++ b/src/include/simeng/arch/aarch64/operandContainer.hh @@ -10,7 +10,7 @@ namespace arch { namespace aarch64 { /** The maximum number of source registers a non-SME instruction can have. 
*/ -const uint8_t MAX_SOURCE_REGISTERS = 6; +const uint8_t MAX_SOURCE_REGISTERS = 7; /** The maximum number of destination registers a non-SME instruction can have. */ diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 703df9d849..7e4da09efc 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -1299,8 +1299,52 @@ span Instruction::generateAddresses() { uint64_t addr = base + (offset * partition_num * 8); - generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 8, p, - addresses); + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt1[1], zt2[1], ...) we must generate an address for each element (if + // the predicate is true for that element). This is because, if the + // predicate indicates that all elements are active, a single address + // and MemoryAccessTarget will be generated with a size of 2xVL. This + // could lead to issues for core models which have a maximum store + // bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + addresses.push_back({addr + (2 * i * 8), 8}); + addresses.push_back({addr + (2 * i * 8) + 8, 8}); + } + } + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST4W: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + const uint64_t* p = sourceValues_[4].getAsVector(); + const uint16_t partition_num = VL_bits / 32; + + const uint64_t base = sourceValues_[5].get(); + const int64_t offset = sourceValues_[6].get(); + + std::vector addresses; + addresses.reserve(partition_num * 4); + + uint64_t addr = base + (offset << 2); + + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } setMemoryAddresses(std::move(addresses)); break; } @@ -1315,12 +1359,24 @@ span Instruction::generateAddresses() { std::vector addresses; addresses.reserve(partition_num * 4); - uint64_t addr = base + (offset * partition_num * 4); - generatePredicatedContiguousAddressBlocks(addr, partition_num, 16, 4, p, - addresses); - + // As vectors are stored in an interleaved manner (i.e. zt1[0], zt2[0], + // zt3[0], zt4[0], zt1[1], zt2[1], zt3[1], zt4[1] ...) we must generate + // an address for each element (if the predicate is true for that + // element). This is because, if the predicate indicates that all + // elements are active, a single address and MemoryAccessTarget will be + // generated with a size of 4xVL. This could lead to issues for core + // models which have a maximum store bandwidth of 1xVL. 
+ for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (p[i / 16] & shifted_active) { + addresses.push_back({addr + (4 * i * 4), 4}); + addresses.push_back({addr + (4 * i * 4) + 4, 4}); + addresses.push_back({addr + (4 * i * 4) + 8, 4}); + addresses.push_back({addr + (4 * i * 4) + 12, 4}); + } + } setMemoryAddresses(std::move(addresses)); break; } diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index aa44d69079..c63d38e3d2 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -5709,33 +5709,15 @@ void Instruction::execute() { const uint64_t* d2 = sourceValues_[1].getAsVector(); const uint64_t* p = sourceValues_[2].getAsVector(); - std::vector memData; - bool inActiveBlock = false; - const uint16_t partition_num = VL_bits / 64; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - // If active and not in active block, initialise - if (!inActiveBlock) { - memData.clear(); - inActiveBlock = true; - } - memData.push_back(d1[i]); - memData.push_back(d2[i]); - } else if (inActiveBlock) { - inActiveBlock = false; - memoryData_[index] = RegisterValue( - (char*)memData.data(), sizeof(uint64_t) * memData.size()); - index++; + memoryData_[index++] = RegisterValue(d1[i], 8); + memoryData_[index++] = RegisterValue(d2[i], 8); } } - // Add final block if needed - if (inActiveBlock) - memoryData_[index] = RegisterValue((char*)memData.data(), - sizeof(uint64_t) * memData.size()); - break; } case Opcode::AArch64_ST2Twov4s_POST: { // st2 {vt1.4s, vt2.4s}, [xn], @@ -5755,6 +5737,9 @@ void Instruction::execute() { results_[0] = sourceValues_[2].get() + postIndex; break; } + case Opcode::AArch64_ST4W: // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, + // pg, [, xm, lsl #2] + [[fallthrough]]; case Opcode::AArch64_ST4W_IMM: { // st4w {zt1.s, zt2.s, zt3.s, zt4.s}, // pg, [{, #imm, mul vl}] // STORE @@ -5764,35 +5749,17 @@ void Instruction::execute() { const uint32_t* d4 = sourceValues_[3].getAsVector(); const uint64_t* p = sourceValues_[4].getAsVector(); - std::vector memData; - bool inActiveBlock = false; - const uint16_t partition_num = VL_bits / 32; uint16_t index = 0; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (p[i / 16] & shifted_active) { - // If active and not in active block, initialise - if (!inActiveBlock) { - memData.clear(); - inActiveBlock = true; - } - memData.push_back(d1[i]); - memData.push_back(d2[i]); - memData.push_back(d3[i]); - memData.push_back(d4[i]); - } else if (inActiveBlock) { - inActiveBlock = false; - memoryData_[index] = RegisterValue( - (char*)memData.data(), sizeof(uint32_t) * memData.size()); - index++; + memoryData_[index++] = RegisterValue(d1[i], 4); + memoryData_[index++] = RegisterValue(d2[i], 4); + memoryData_[index++] = RegisterValue(d3[i], 4); + memoryData_[index++] = RegisterValue(d4[i], 4); } } - // Add final block if needed - if (inActiveBlock) - memoryData_[index] = RegisterValue((char*)memData.data(), - sizeof(uint32_t) * memData.size()); - break; } case Opcode::AArch64_STLRB: { // stlrb wt, [xn] diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 43382697e6..a4103b9ecb 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -7321,6 +7321,8 @@ TEST_P(InstSve, st4w) { st4w 
{z0.s - z3.s}, p0, [sp] st4w {z0.s - z3.s}, p1, [x6, #4, mul vl] + addvl x7, x7, #3 + st4w {z0.s - z3.s}, p1, [x6, x7, lsl #2] )"); for (uint64_t i = 0; i < (VL / 32); i++) { @@ -7345,6 +7347,14 @@ TEST_P(InstSve, st4w) { EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); } + + index = 12 * (VL / 8); + for (uint64_t i = 0; i < (VL / 64); i++) { + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4)), 3); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 4), 4); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 8), 5); + EXPECT_EQ(getMemoryValue(300 + index + (4 * i * 4) + 12), 6); + } } TEST_P(InstSve, st1w_scatter) { From 4c99a0f4a1e69e6a144c3400527c7d1800ddc97d Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 6 Nov 2024 22:00:13 +0000 Subject: [PATCH 62/71] Implemented LD1B (4 vec, scalar offset) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_address.cc | 17 ++++ src/lib/arch/aarch64/Instruction_execute.cc | 5 +- test/regression/aarch64/instructions/sve.cc | 92 ++++++++++++++++++++- 3 files changed, 112 insertions(+), 2 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 7e4da09efc..67e4599e5c 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -396,6 +396,23 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } + case Opcode::AArch64_LD1B_4Z: { // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + const uint64_t base = sourceValues_[1].get(); + const int64_t offset = sourceValues_[2].get(); + const uint64_t addr = base + offset; + + std::vector addresses; + addresses.reserve(4); + + uint16_t blockSize = VL_bits / 8; + addresses.push_back({addr, blockSize}); + addresses.push_back({addr + blockSize, blockSize}); + addresses.push_back({addr + 2 * blockSize, blockSize}); + addresses.push_back({addr + 3 * blockSize, blockSize}); + + setMemoryAddresses(std::move(addresses)); + break; + } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] const uint64_t base = sourceValues_[1].get(); const uint64_t offset = sourceValues_[2].get(); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index c63d38e3d2..d398e7ef39 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3075,7 +3075,7 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] + case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] // LOAD const uint64_t* p = sourceValues_[0].getAsVector(); @@ -3147,6 +3147,9 @@ void Instruction::execute() { // mul vl}] // LOAD [[fallthrough]]; + case Opcode::AArch64_LD1B_4Z: // ld1b {zt1.b - zt4.b}, png/z, [xn, xm] + // LOAD + [[fallthrough]]; case Opcode::AArch64_LD1B_4Z_IMM: { // ld1b {zt1.b - zt4.b}, png/z, [xn{, // #imm, mul vl}] // LOAD diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index a4103b9ecb..16a966d00a 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -5035,6 +5035,7 @@ TEST_P(InstSve, ld1b) { mov x1, #4 ld1b {z0.b - z3.b}, pn8/z, [x0, #4, mul vl] + ld1b {z4.b - z7.b}, pn8/z, [x0, x1] ld1b {z16.b, z20.b, z24.b, z28.b}, pn8/z, [x0, #4, mul vl] ld1b {z17.b, z21.b, z25.b, z29.b}, pn8/z, [x0, x1] )"); @@ -5125,7 +5126,95 @@ TEST_P(InstSve, 
ld1b) { src[((base + (3 * offset)) + 15) % 16], }, VL / 8)); + base = 4; + offset = (VL / 8); + CHECK_NEON(4, uint8_t, + fillNeon( + { + src[(base) % 16], + src[(base + 1) % 16], + src[(base + 2) % 16], + src[(base + 3) % 16], + src[(base + 4) % 16], + src[(base + 5) % 16], + src[(base + 6) % 16], + src[(base + 7) % 16], + src[(base + 8) % 16], + src[(base + 9) % 16], + src[(base + 10) % 16], + src[(base + 11) % 16], + src[(base + 12) % 16], + src[(base + 13) % 16], + src[(base + 14) % 16], + src[(base + 15) % 16], + }, + VL / 8)); + CHECK_NEON(4, uint8_t, + fillNeon( + { + src[((base + offset)) % 16], + src[((base + offset) + 1) % 16], + src[((base + offset) + 2) % 16], + src[((base + offset) + 3) % 16], + src[((base + offset) + 4) % 16], + src[((base + offset) + 5) % 16], + src[((base + offset) + 6) % 16], + src[((base + offset) + 7) % 16], + src[((base + offset) + 8) % 16], + src[((base + offset) + 9) % 16], + src[((base + offset) + 10) % 16], + src[((base + offset) + 11) % 16], + src[((base + offset) + 12) % 16], + src[((base + offset) + 13) % 16], + src[((base + offset) + 14) % 16], + src[((base + offset) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(6, uint8_t, + fillNeon( + { + src[((base + (2 * offset))) % 16], + src[((base + (2 * offset)) + 1) % 16], + src[((base + (2 * offset)) + 2) % 16], + src[((base + (2 * offset)) + 3) % 16], + src[((base + (2 * offset)) + 4) % 16], + src[((base + (2 * offset)) + 5) % 16], + src[((base + (2 * offset)) + 6) % 16], + src[((base + (2 * offset)) + 7) % 16], + src[((base + (2 * offset)) + 8) % 16], + src[((base + (2 * offset)) + 9) % 16], + src[((base + (2 * offset)) + 10) % 16], + src[((base + (2 * offset)) + 11) % 16], + src[((base + (2 * offset)) + 12) % 16], + src[((base + (2 * offset)) + 13) % 16], + src[((base + (2 * offset)) + 14) % 16], + src[((base + (2 * offset)) + 15) % 16], + }, + VL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon( + { + src[((base + (3 * offset))) % 16], + src[((base + (3 * offset)) + 1) % 16], + src[((base + (3 * offset)) + 2) % 16], + src[((base + (3 * offset)) + 3) % 16], + src[((base + (3 * offset)) + 4) % 16], + src[((base + (3 * offset)) + 5) % 16], + src[((base + (3 * offset)) + 6) % 16], + src[((base + (3 * offset)) + 7) % 16], + src[((base + (3 * offset)) + 8) % 16], + src[((base + (3 * offset)) + 9) % 16], + src[((base + (3 * offset)) + 10) % 16], + src[((base + (3 * offset)) + 11) % 16], + src[((base + (3 * offset)) + 12) % 16], + src[((base + (3 * offset)) + 13) % 16], + src[((base + (3 * offset)) + 14) % 16], + src[((base + (3 * offset)) + 15) % 16], + }, + VL / 8)); // Strided (4-stride) vectors + base = (VL / 8) * 4; + offset = (VL / 8); CHECK_NEON(16, uint8_t, fillNeon( { @@ -5210,7 +5299,8 @@ TEST_P(InstSve, ld1b) { src[((base + (3 * offset)) + 15) % 16], }, VL / 8)); - base = (VL / 8) + 4; + base = 4; + offset = (VL / 8); CHECK_NEON(17, uint8_t, fillNeon( { From 0d74234ea2e591fab17e8da5baac2a78b93625be Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 11:34:27 +0000 Subject: [PATCH 63/71] Implemented UDOT (4-way, VGx4 8-bit to 32-bit widening) SME instruction with tests. 
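
As a cross-check for the UDOT helpers above and the SME variant below, the 4-way unsigned dot product can be stated as a tiny scalar reference model: each 32-bit accumulator element gains the sum of four adjacent 8-bit products. The sketch below is illustrative only (plain arrays instead of SimEng's RegisterValue/sourceValues containers; udot4_reference and the variable names are invented), but it reproduces the 156/316/476/636 row values used in the vgx4 regression test.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Reference model of a 4-way unsigned dot product with accumulate:
    //   acc[i] += a[4i+0]*b[4i+0] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3]
    void udot4_reference(std::vector<uint32_t>& acc, const std::vector<uint8_t>& a,
                         const std::vector<uint8_t>& b) {
      for (size_t i = 0; i < acc.size(); i++) {
        for (size_t j = 0; j < 4; j++) {
          acc[i] += static_cast<uint32_t>(a[4 * i + j]) *
                    static_cast<uint32_t>(b[4 * i + j]);
        }
      }
    }

    int main() {
      std::vector<uint32_t> acc(4, 96);  // rows pre-filled with 96 by the umopa setup
      std::vector<uint8_t> a(16, 10);    // mirrors `dup z4.b, #10`
      std::vector<uint8_t> b(16);        // mirrors heap bytes 0..15
      for (size_t i = 0; i < b.size(); i++) b[i] = static_cast<uint8_t>(i);
      udot4_reference(acc, a, b);
      for (uint32_t v : acc) std::printf("%u ", v);  // 156 316 476 636
      std::printf("\n");
      return 0;
    }
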
--- src/lib/arch/aarch64/Instruction_execute.cc | 49 ++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 65 ++++++++++++++++++++- 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index d398e7ef39..4300fc68c3 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -6392,6 +6392,55 @@ void Instruction::execute() { results_[0] = {div_3ops(sourceValues_), 8}; break; } + case Opcode::AArch64_UDOT_VG4_M4Z4Z_BtoS: { // udot za.s[wv, #off, vgx4], + // {zn1.b - zn4.b}, {zm1.b - + // zm4.b} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + // Get ZA stride between quarters and index into each ZA quarter + const uint16_t zaStride = zaRowCount / 4; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 4 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + // Get base zn and zm register indexed in sourceValues + const uint16_t znBase = zaRowCount + 1; + const uint16_t zmBase = zaRowCount + 5; + + // Loop over each source vector and destination vector (from the za + // single-vector group) pair + for (int r = 0; r < 4; r++) { + // For ZA single-vector groups of 4 vectors (vgx4), each vector is in + // a different quarter of ZA; indexed into it by Wv+off. + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint8_t* znr = sourceValues_[znBase + r].getAsVector(); + const uint8_t* zmr = sourceValues_[zmBase + r].getAsVector(); + uint32_t out[64] = {0}; + // Loop over all 32-bit elements of output row vector `zaRow` + for (int e = 0; e < elemCount; e++) { + out[e] = zaRow[e]; + // There are 4 8-bit elements per 32-bit element of `znr` and `zmr` + for (int i = 0; i < 4; i++) { + out[e] += static_cast(znr[4 * e + i]) * + static_cast(zmr[4 * e + i]); + } + } + // Update results_ for completed row + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_UDOT_VG4_M4ZZI_BtoS: { // udot za.s[wv, #off, vgx4], // {zn1.b - zn4.b}, // zm.b[#index] diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index eb3ef04e4f..8b12725472 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -1086,7 +1086,70 @@ TEST_P(InstSme, udot_Indexed_vgx4) { } } -TEST_P(InstSme, uvdot_vgx4) { +TEST_P(InstSme, udot_vgx4) { + // 8-bit to 32-bit widening + initialHeapData_.resize(SVL / 8); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + fillHeap(heap8, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umopa za1.s, p0/m, p1/m, z1.b, z2.b + umopa za2.s, p0/m, p1/m, z1.b, z2.b + umopa za3.s, p0/m, p1/m, z1.b, z2.b + + # initialise registers + mov w8, #1 + dup z4.b, #10 + dup z5.b, #11 + dup z6.b, #12 + dup z7.b, #13 + ld1b {z8.b}, p0/z, [x0] + ld1b 
{z9.b}, p0/z, [x0] + ld1b {z10.b}, p0/z, [x0] + ld1b {z11.b}, p0/z, [x0] + + udot za.s[w8, #1, vgx4], {z4.b - z7.b}, {z8.b - z11.b} + )"); + const uint16_t zaStride = (SVL / 8) / 4; + const uint16_t zaQuartIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({156, 316, 476, 636}, (SVL / 8))); + } else if (i == zaStride + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({162, 338, 514, 690}, (SVL / 8))); + } else if (i == (2 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({168, 360, 552, 744}, (SVL / 8))); + } else if (i == (3 * zaStride) + zaQuartIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({174, 382, 590, 798}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + +TEST_P(InstSme, uvdot_indexed_vgx4) { // 8-bit to 32-bit widening initialHeapData_.resize(SVL / 8); uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); From 40a0fa4d0c762eb3e59f67d289f81f280a797856 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 16:50:31 +0000 Subject: [PATCH 64/71] Implemented ADD (uint32, vgx2, vectors and ZA), SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 34 ++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 45 +++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4300fc68c3..28a2546797 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -328,6 +328,40 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_ADD_VG2_M2Z_S: { // add za.s[wv, off, vgx2], {zn1.s, + // zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + for (int r = 0; r < 2; r++) { + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector(); + const uint32_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector(); + uint32_t out[64] = {0}; + for (int i = 0; i < elemCount; i++) { + out[i] = zaRow[i] + znr[i]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_ADR: { // adr xd, #imm results_[0] = instructionAddress_ + metadata_.operands[1].imm; break; diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 8b12725472..aca3d0ba99 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,6 +7,51 @@ namespace { using InstSme = AArch64RegressionTest; +TEST_P(InstSme, add) { + // uint32_T, vgx2, vecs with ZA + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 
(uint32_t) + dup z0.b, #8 + dup z1.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z0.b, z1.b + umopa za1.s, p0/m, p1/m, z0.b, z1.b + umopa za2.s, p0/m, p1/m, z0.b, z1.b + umopa za3.s, p0/m, p1/m, z0.b, z1.b + + # Set 2 of the za rows + mov w8, #1 + dup z0.s, #8 + dup z1.s, #3 + add za.s[w8, #1, vgx2], {z0.s, z1.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({104}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({99}, (SVL / 8))); + } else { + // un-effected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, mova_tileToVec) { // 8-bit RUN_AARCH64(R"( From 950de4124cdfbfb13bc67fcac043da4232451e0a Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 19:58:23 +0000 Subject: [PATCH 65/71] Implemented ZIP (4 vectors) SVE2 instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 23 +++++++++++++++++++++ test/regression/aarch64/instructions/sve.cc | 15 +++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 28a2546797..558ebc0525 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -7076,6 +7076,29 @@ void Instruction::execute() { results_[0] = vecZip(sourceValues_, true); break; } + case Opcode::AArch64_ZIP_VG4_4Z4Z_S: { // zip {zd1.s - zd4.s}, {zn1.s - + // zn4.s} + const uint32_t* zn[4]; + zn[0] = sourceValues_[0].getAsVector(); + zn[1] = sourceValues_[1].getAsVector(); + zn[2] = sourceValues_[2].getAsVector(); + zn[3] = sourceValues_[3].getAsVector(); + + const uint16_t quads = VL_bits / (32 * 4); + + uint32_t out[4][64] = {{0}, {0}, {0}, {0}}; + for (int r = 0; r < 4; r++) { + const uint16_t base = r * quads; + for (int q = 0; q < quads; q++) { + out[r][4 * q] = zn[0][base + q]; + out[r][4 * q + 1] = zn[1][base + q]; + out[r][4 * q + 2] = zn[2][base + q]; + out[r][4 * q + 3] = zn[3][base + q]; + } + results_[r] = RegisterValue(out[r], 256); + } + break; + } case Opcode::AArch64_ZERO_M: { // zero {mask} // SME // Not in right context mode. 
Raise exception diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index 16a966d00a..f9699593f3 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -9145,13 +9145,26 @@ TEST_P(InstSve, zip) { zip1 z16.b, z12.b, z13.b zip2 z17.b, z14.b, z15.b )"); - CHECK_NEON(4, double, fillNeon({0.5, -0.5}, VL / 8)); CHECK_NEON(5, double, fillNeon({0.75, -0.75}, VL / 8)); CHECK_NEON(10, float, fillNeon({0.5, -0.75}, VL / 8)); CHECK_NEON(11, float, fillNeon({-0.5, 0.75}, VL / 8)); CHECK_NEON(16, int8_t, fillNeon({1, -2}, VL / 8)); CHECK_NEON(17, int8_t, fillNeon({-1, 2}, VL / 8)); + + // Multi-vector + RUN_AARCH64(R"( + #32-bit + dup z0.s, #5 + dup z1.s, #6 + dup z2.s, #7 + dup z3.s, #8 + zip {z4.s - z7.s}, {z0.s - z3.s} + )"); + CHECK_NEON(4, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(5, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(6, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); + CHECK_NEON(7, uint32_t, fillNeon({5, 6, 7, 8}, VL / 8)); } TEST_P(InstSve, psel) { From 03a95e70f21701be27d2f8e01bdd31955a53f248 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Tue, 10 Dec 2024 12:11:14 +0000 Subject: [PATCH 66/71] Attended PR comments. --- CMakeLists.txt | 3 +-- src/include/simeng/arch/aarch64/Instruction.hh | 7 ++++--- src/include/simeng/arch/aarch64/helpers/sve.hh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 42111288ff..afeeb4abac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,8 +157,7 @@ if(SIMENG_ENABLE_TESTS) # Print message containing if the full test suite will run if (${LLVM_PACKAGE_VERSION} VERSION_LESS "14.0") message(STATUS "LLVM version does not support AArch64 extensions SVE2, SVE2.1, SME, or SME2. Related tests will fail.") - endif() - if (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") + elseif (${LLVM_PACKAGE_VERSION} VERSION_LESS "18.0") message(STATUS "LLVM version does not support AArch64 extensions SME2 or SVE2.1. Related test will fail.") endif() diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index f3854c84b4..6db73f0e69 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -306,9 +306,10 @@ std::vector> predAsCounterToMasks( for (int i = 0; i < elemsPerVec; i++) { // Move bit to next position based on element type uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); - // If invert = 1, predElemCount dictates number of initial inactive - // elements. - // If invert = 0, it is number of initial active elements. + // If invert = True (invert bit = 1), predElemCount dictates number of + // initial inactive elements. + // If invert = False (invert bit = 0), it indicates the number of initial + // active elements. if ((r * elemsPerVec) + i < predElemCount) { out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; } else { diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index 50eb19c657..a42bd9680c 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -626,7 +626,7 @@ std::enable_if_t, RegisterValue> sveFDivPredicated( return {out, 256}; } -/** Helpfer function for SVE instructions with the format `faddv rd, pg, zn. +/** Helper function for SVE instructions with the format `faddv rd, pg, zn. 
* D represents the source vector element type and the destination scalar * register type (i.e. for zn.s and sd, D = float). * Returns correctly formatted RegisterValue. */ From 672936312ab2b115cd167ce17d59affbb079fd9b Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 13 Dec 2024 18:02:01 +0000 Subject: [PATCH 67/71] Minor bug fixes. --- src/lib/arch/aarch64/Instruction_decode.cc | 11 ++++++----- src/lib/arch/aarch64/Instruction_execute.cc | 9 +++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..a6bc075efd 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -537,8 +537,9 @@ void Instruction::decode() { if (isInstruction(InsnType::isStoreData)) { // Identify store instruction group - if (AARCH64_REG_Z0 <= metadata_.operands[0].reg && - metadata_.operands[0].reg <= AARCH64_REG_Z31) { + if ((AARCH64_REG_Z0 <= metadata_.operands[0].reg && + metadata_.operands[0].reg <= AARCH64_REG_Z31) || + metadata_.operands[0].reg == AARCH64_REG_ZT0) { setInstructionType(InsnType::isSVEData); } else if ((metadata_.operands[0].reg <= AARCH64_REG_S31 && metadata_.operands[0].reg >= AARCH64_REG_Q0) || @@ -548,7 +549,7 @@ void Instruction::decode() { } else if (metadata_.operands[0].is_vreg) { setInstructionType(InsnType::isVectorData); } else if ((metadata_.operands[0].reg >= AARCH64_REG_ZAB0 && - metadata_.operands[0].reg <= AARCH64_REG_ZT0) || + metadata_.operands[0].reg < AARCH64_REG_ZT0) || metadata_.operands[0].reg == AARCH64_REG_ZA) { setInstructionType(InsnType::isSMEData); } @@ -644,8 +645,8 @@ void Instruction::decode() { } } } else { - // For SME instructions, resize the following structures to have the - // exact amount of space required + // For SME instructions (not using ZT0), resize the following structures to + // have the exact amount of space required sourceRegisters_.resize(sourceRegisterCount_); destinationRegisters_.resize(destinationRegisterCount_); sourceValues_.resize(sourceRegisterCount_); diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 558ebc0525..6ed9d6695a 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -4181,10 +4181,11 @@ void Instruction::execute() { } case Opcode::AArch64_LDRSBWpost: { // ldrsb wt, [xn], #imm // LOAD - results_[1] = RegisterValue( - static_cast(memoryData_[0].get()), 4); - results_[0] = - sourceValues_[0].get() + metadata_.operands[2].imm; + results_[1] = + RegisterValue(static_cast(memoryData_[0].get()), 4) + .zeroExtend(4, 8); + results_[0] = RegisterValue( + sourceValues_[0].get() + metadata_.operands[2].imm, 8); break; } case Opcode::AArch64_LDRSBWroX: { // ldrsb wt, [xn, xm{, extend From 850b741068d7adf2696bfc4e01a7f9c8450c9a7c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Mon, 16 Dec 2024 13:16:01 +0000 Subject: [PATCH 68/71] Attended PR comments. 
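
The LDRSBWpost change in the previous patch hinges on two AArch64 facts: the loaded byte is sign-extended to 32 bits, and a write to a W register clears bits 63:32 of the underlying X register. A stand-alone illustration of the intended destination value (not SimEng code; ldrsb_w_result is an invented name):

    #include <cstdint>
    #include <cstdio>

    // Models the destination value of `ldrsb wt, [xn], #imm`:
    // sign-extend the byte to 32 bits, then zero-extend into the 64-bit register.
    uint64_t ldrsb_w_result(uint8_t loadedByte) {
      int32_t signExtended = static_cast<int8_t>(loadedByte);  // e.g. 0xF0 -> -16
      return static_cast<uint32_t>(signExtended);              // upper 32 bits cleared
    }

    int main() {
      std::printf("%#llx\n", (unsigned long long)ldrsb_w_result(0xF0));  // 0xfffffff0
      std::printf("%#llx\n", (unsigned long long)ldrsb_w_result(0x7B));  // 0x7b
      return 0;
    }
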
--- .../simeng/arch/aarch64/Instruction.hh | 5 +---- src/include/simeng/version.hh.in | 2 +- src/lib/arch/aarch64/Instruction_execute.cc | 7 ++++--- .../aarch64/AArch64RegressionTest.hh | 21 ++++++++----------- test/regression/aarch64/Exception.cc | 2 ++ test/regression/aarch64/instructions/float.cc | 6 +++--- test/regression/aarch64/instructions/sme.cc | 2 +- 7 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index 6db73f0e69..b1ffb97575 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -283,9 +283,6 @@ enum class InsnType : uint32_t { isBranch = 1 << 14 }; -/** Predefined shift values for converting pred-as-counter to pred-as-mask. */ -const uint64_t predCountShiftVals[9] = {0, 1, 2, 0, 3, 0, 0, 0, 4}; - /** Convert Predicate-as-Counter to Predicate-as-Masks. * T represents the element type (i.e. for pg.s, T = uint32_t). * V represents the number of vectors the predicate-as-counter is being used @@ -300,7 +297,7 @@ std::vector> predAsCounterToMasks( const bool invert = (predAsCounter & 0b1000000000000000) != 0; const uint64_t predElemCount = (predAsCounter & static_cast(0b0111111111111111)) >> - predCountShiftVals[sizeof(T)]; + static_cast(std::log2f(sizeof(T)) + 1); for (int r = 0; r < V; r++) { for (int i = 0; i < elemsPerVec; i++) { diff --git a/src/include/simeng/version.hh.in b/src/include/simeng/version.hh.in index 8a2a823a66..f563e281f9 100644 --- a/src/include/simeng/version.hh.in +++ b/src/include/simeng/version.hh.in @@ -9,6 +9,6 @@ #define SIMENG_LLVM_VERSION @SIMENG_LLVM_VERSION@ #define SIMENG_ENABLE_TESTS "${SIMENG_ENABLE_TESTS}" #define SIMENG_BUILD_DIR "${CMAKE_BINARY_DIR}" -#define SIMENG_ENABLE_BF16 "${SIMENG_ENABLE_BF16}" +#define SIMENG_ENABLE_BF16 ${SIMENG_ENABLE_BF16} #endif \ No newline at end of file diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 6ed9d6695a..78dcb6c5d8 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -520,10 +520,10 @@ void Instruction::execute() { branchAddress_ = instructionAddress_ + metadata_.operands[0].imm; break; } +#if SIMENG_ENABLE_BF16 == 1 case Opcode::AArch64_BF16DOTlanev8bf16: { // bfdot vd.4s, vn.8h, // vm.2h[index] // BF16 -- EXPERIMENTAL - if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); // Must be enabled at SimEng compile time // Not verified to be working for all compilers or OSs. // No Tests written @@ -545,7 +545,6 @@ void Instruction::execute() { } case Opcode::AArch64_BFDOT_ZZI: { // bfdot zd.s, zn.h, zm.h[index] // BF16 -- EXPERIMENTAL - if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); // Must be enabled at SimEng compile time // Not verified to be working for all compilers or OSs. 
// No Tests written @@ -580,6 +579,7 @@ void Instruction::execute() { results_[0] = RegisterValue(out, 256); break; } +#endif case Opcode::AArch64_BFMWri: { // bfm wd, wn, #immr, #imms results_[0] = { bfm_2imms(sourceValues_, metadata_, false, false), 8}; @@ -2252,11 +2252,11 @@ void Instruction::execute() { results_[0] = vecFmlsIndexed_3vecs(sourceValues_, metadata_); break; } +#if SIMENG_ENABLE_BF16 == 1 case Opcode::AArch64_BFMOPA_MPPZZ: { // bfmopa zada.s, pn/m, pm/m, zn.h, // zm.h // SME // BF16 -- EXPERIMENTAL - if (std::string(SIMENG_ENABLE_BF16) == "OFF") return executionNYI(); // Must be enabled at SimEng compile time // Not verified to be working for all compilers or OSs. // No Tests written @@ -2308,6 +2308,7 @@ void Instruction::execute() { } break; } +#endif case Opcode::AArch64_FMOPA_MPPZZ_D: { // fmopa zada.d, pn/m, pm/m, zn.d, // zm.d // SME diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh index 8285726ee7..6afdc47d2a 100644 --- a/test/regression/aarch64/AArch64RegressionTest.hh +++ b/test/regression/aarch64/AArch64RegressionTest.hh @@ -192,20 +192,19 @@ inline std::vector> genCoreTypeSVLPairs( /** Check each element of the Lookup Table register ZT0 against expected values. * - * The `tag` argument is the register index (must be 0), and the `type` argument - * is the C++ data type to use for value comparisons. The third argument should - * be an initializer list containing one value for each register element (for a - * total of `(64 / sizeof(type))` values). + * The `type` argument is the C++ data type to use for value comparisons. The + * third argument should be an initializer list containing one value for each + * register element (for a total of `(64 / sizeof(type))` values). * * For example: * * // Compare zt0 to some expected 32-bit uint64 values. * CHECK_TABLE(0, uint32_t, {1, 2, 3, 4, ..., 16}); */ -#define CHECK_TABLE(tag, type, ...) \ - { \ - SCOPED_TRACE("<<== error generated here"); \ - checkTableRegister(tag, __VA_ARGS__); \ +#define CHECK_TABLE(type, ...) \ + { \ + SCOPED_TRACE("<<== error generated here"); \ + checkTableRegister(__VA_ARGS__); \ } /** A helper macro to predecode the first instruction in a snippet of Armv9.2-a @@ -385,11 +384,9 @@ class AArch64RegressionTest : public RegressionTest { * better diagnostic messages, rather than called directly from test code. 
*/ template - void checkTableRegister(uint8_t tag, - const std::array& values) const { - assert(tag == 0 && "Only a tag of value 0 is valid for Table registers"); + void checkTableRegister(const std::array& values) const { const T* data = RegressionTest::getVectorRegister( - {simeng::arch::aarch64::RegisterType::TABLE, tag}); + {simeng::arch::aarch64::RegisterType::TABLE, 0}); for (unsigned i = 0; i < (64 / sizeof(T)); i++) { EXPECT_NEAR(data[i], values[i], 0.0005) << "Mismatch for element " << i << "."; diff --git a/test/regression/aarch64/Exception.cc b/test/regression/aarch64/Exception.cc index 2133629473..b987ae4429 100644 --- a/test/regression/aarch64/Exception.cc +++ b/test/regression/aarch64/Exception.cc @@ -151,6 +151,7 @@ TEST_P(Exception, unmapped_sys_reg) { EXPECT_EQ(stdout_.substr(0, strlen(err)), err); } +#if SIMENG_LLVM_VERSION >= 14 // TODO: Write test for InstructionException::StreamingModeUpdate once it has a // trigger case // TODO: Write test for InstructionException::ZAregisterStatusUpdate once it has @@ -370,6 +371,7 @@ TEST_P(Exception, svcr) { fillNeon({0}, SVL / 8)); } } +#endif INSTANTIATE_TEST_SUITE_P( AArch64, Exception, diff --git a/test/regression/aarch64/instructions/float.cc b/test/regression/aarch64/instructions/float.cc index bc2d09ea27..627e710e7c 100644 --- a/test/regression/aarch64/instructions/float.cc +++ b/test/regression/aarch64/instructions/float.cc @@ -1459,9 +1459,9 @@ TEST_P(InstFloat, ucvtf) { // representation error to ensure tests pass initialHeapData_.resize(12); heap32 = reinterpret_cast(initialHeapData_.data()); - heap32[0] = 0x000001EE; - heap32[1] = 0x00021F3B; - heap32[2] = 0x32FE6B75; + heap32[0] = 0x000001EE; // 123.5 (2 fraction bits) + heap32[1] = 0x00021F3B; // 543.23 (8 fraction bits) + heap32[2] = 0x32FE6B75; // 101.987654321 (23 fraction bits) RUN_AARCH64(R"( # Get heap address mov x0, 0 diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index aca3d0ba99..d908d13a1d 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -1294,7 +1294,7 @@ TEST_P(InstSme, zero) { zero {zt0} )"); - CHECK_TABLE(0, uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); + CHECK_TABLE(uint64_t, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}); // ZA tiles RUN_AARCH64(R"( From 1d0409697df3921e553057312dfb7e07772601d6 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Wed, 18 Dec 2024 10:29:18 +0000 Subject: [PATCH 69/71] Updated multi-vector load logic. 
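
The hunks below hoist the per-register getAsVector call out of the element loop, so each destination register's backing memory block is fetched once and the inner loop only tests predicate lanes. A self-contained sketch of that pattern (invented names and a plain bool mask; SimEng's predicates are packed 64-bit bitmasks and its data comes from memoryData_):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    constexpr int kRegs = 2;
    constexpr int kElems = 4;

    // Predicated multi-register load pattern: fetch each register's source
    // block once, then copy only the active lanes; inactive lanes keep their
    // existing (here zero-initialised) value.
    void predicatedMultiLoad(std::array<std::array<uint32_t, kElems>, kRegs>& out,
                             const std::array<std::array<uint32_t, kElems>, kRegs>& mem,
                             const std::array<std::array<bool, kElems>, kRegs>& pred) {
      for (int r = 0; r < kRegs; r++) {
        const uint32_t* data = mem[r].data();  // hoisted out of the element loop
        for (int i = 0; i < kElems; i++) {
          if (pred[r][i]) out[r][i] = data[i];
        }
      }
    }

    int main() {
      std::array<std::array<uint32_t, kElems>, kRegs> out{};
      const std::array<std::array<uint32_t, kElems>, kRegs> mem{{{1, 2, 3, 4}, {5, 6, 7, 8}}};
      const std::array<std::array<bool, kElems>, kRegs> pred{{{true, false, true, false},
                                                              {true, true, false, false}}};
      predicatedMultiLoad(out, mem, pred);
      for (int r = 0; r < kRegs; r++) {
        for (int i = 0; i < kElems; i++) std::printf("%u ", out[r][i]);
        std::printf("\n");  // prints: 1 0 3 0   then   5 6 0 0
      }
      return 0;
    }
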
--- src/lib/arch/aarch64/Instruction_execute.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 78dcb6c5d8..1981a02b71 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -3162,10 +3162,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 8; for (int r = 0; r < 2; r++) { + const uint8_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << (i % 64); if (preds[r][i / 64] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3240,10 +3241,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 2; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (preds[r][i / 8] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3266,10 +3268,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 64; for (int r = 0; r < 4; r++) { + const uint64_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (preds[r][i / 8] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3337,10 +3340,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 16; for (int r = 0; r < 2; r++) { + const uint16_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 32) * 2); if (preds[r][i / 32] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3777,10 +3781,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 32; for (int r = 0; r < 2; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (preds[r][i / 16] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } @@ -3803,10 +3808,11 @@ void Instruction::execute() { const uint16_t partition_num = VL_bits / 32; for (int r = 0; r < 4; r++) { + const uint32_t* data = memoryData_[r].getAsVector(); for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (preds[r][i / 16] & shifted_active) { - out[r][i] = memoryData_[r].getAsVector()[i]; + out[r][i] = data[i]; } } } From 246d39ab47748e81262955290a319a17027ad307 Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 20 Dec 2024 11:06:10 +0000 Subject: [PATCH 70/71] CI CD fixes. 
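
The first hunk below touches the predicate-as-counter decode, so a compact reference for that encoding may help review: bit 15 is the invert flag, and bits 14:0 hold the element count scaled by the element size (a left shift of log2(bytes) + 1), as in the helper earlier in the series. The sketch assumes that layout; decodePredAsCounter and the bool mask are illustrative, not SimEng types.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Decode one vector's worth of lanes from a predicate-as-counter value.
    // Without invert, the first `count` lanes are active; with invert, the
    // first `count` lanes are inactive and the rest are active.
    std::vector<bool> decodePredAsCounter(uint16_t predAsCounter, unsigned elems,
                                          unsigned elemBytes) {
      const bool invert = (predAsCounter & 0x8000u) != 0;
      const unsigned shift =
          static_cast<unsigned>(std::log2f(static_cast<float>(elemBytes)) + 1);
      const unsigned count = (predAsCounter & 0x7FFFu) >> shift;
      std::vector<bool> active(elems);
      for (unsigned i = 0; i < elems; i++) {
        active[i] = (i < count) != invert;
      }
      return active;
    }

    int main() {
      // 32-bit elements (shift = 3), two active lanes, invert clear: value = 2 << 3.
      for (bool a : decodePredAsCounter(2u << 3, 4, 4)) std::printf("%d ", a ? 1 : 0);
      std::printf("\n");  // 1 1 0 0
      return 0;
    }
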
--- src/include/simeng/arch/aarch64/Instruction.hh | 4 ++-- src/include/simeng/arch/aarch64/helpers/sve.hh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index b1ffb97575..6cbc0c2908 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -300,14 +300,14 @@ std::vector> predAsCounterToMasks( static_cast(std::log2f(sizeof(T)) + 1); for (int r = 0; r < V; r++) { - for (int i = 0; i < elemsPerVec; i++) { + for (uint16_t i = 0; i < elemsPerVec; i++) { // Move bit to next position based on element type uint64_t shifted_active = 1ull << ((i % (64 / sizeof(T))) * sizeof(T)); // If invert = True (invert bit = 1), predElemCount dictates number of // initial inactive elements. // If invert = False (invert bit = 0), it indicates the number of initial // active elements. - if ((r * elemsPerVec) + i < predElemCount) { + if (static_cast(r * elemsPerVec) + i < predElemCount) { out[r][i / (64 / sizeof(T))] |= (invert) ? 0 : shifted_active; } else { out[r][i / (64 / sizeof(T))] |= (invert) ? shifted_active : 0; diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh index a42bd9680c..cf9ffd5683 100644 --- a/src/include/simeng/arch/aarch64/helpers/sve.hh +++ b/src/include/simeng/arch/aarch64/helpers/sve.hh @@ -1636,7 +1636,7 @@ RegisterValue sveUdot( const N* zm = sourceValues[2].getAsVector(); D out[256 / sizeof(D)] = {0}; - for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { out[i] = zd[i]; for (int j = 0; j < W; j++) { out[i] += @@ -1666,7 +1666,7 @@ RegisterValue sveUdot_indexed( const int index = metadata.operands[2].vector_index; D out[256 / sizeof(D)] = {0}; - for (int i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { + for (size_t i = 0; i < (VL_bits / (sizeof(D) * 8)); i++) { D acc = zd[i]; // Index into zm selects which D-type element within each 128-bit vector // segment to use From 0ec0b8db24f62d28d7f131c4830a8677441f49dd Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 20 Dec 2024 11:49:48 +0000 Subject: [PATCH 71/71] CI CD fixes pt2. 
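
The loop-counter retyping below (uint64_t to uint16_t in sme.cc, int to uint64_t in sve.cc) reads as a fix for signed/unsigned comparison warnings in the CI compilers; that is an inference, not something the patch states. Under that assumption, a minimal illustration of the warning class and of the fix of matching the counter's type to the bound's type:

    #include <cstdint>

    void example(uint64_t vlBits) {
      // for (int i = 0; i < vlBits / 32; i++) {}   // -Wsign-compare: int vs uint64_t
      for (uint64_t i = 0; i < vlBits / 32; i++) {  // both operands unsigned: no warning
        (void)i;
      }
    }

    int main() {
      example(512);
      return 0;
    }
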
--- test/regression/aarch64/instructions/sme.cc | 46 ++++++++++----------- test/regression/aarch64/instructions/sve.cc | 4 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index d908d13a1d..75be221ae0 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -37,7 +37,7 @@ TEST_P(InstSme, add) { )"); const uint16_t zaStride = (SVL / 8) / 2; const uint16_t zaHalfIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, fillNeon({104}, (SVL / 8))); @@ -279,7 +279,7 @@ TEST_P(InstSme, fadd) { )"); const uint16_t zaStride = (SVL / 8) / 2; const uint16_t zaHalfIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, fillNeon({21.5f}, (SVL / 8))); @@ -330,7 +330,7 @@ TEST_P(InstSme, fadd) { fadd za.d[w8, #1, vgx2], {z4.d, z5.d} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaHalfIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, fillNeon({21.5}, (SVL / 8))); @@ -382,7 +382,7 @@ TEST_P(InstSme, fmla_multiVecs) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0f if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, @@ -441,7 +441,7 @@ TEST_P(InstSme, fmla_multiVecs) { fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, {z8.d - z11.d} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0 if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, @@ -501,7 +501,7 @@ TEST_P(InstSme, fmla_indexed_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0f if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, float, @@ -561,7 +561,7 @@ TEST_P(InstSme, fmla_indexed_vgx4) { fmla za.d[w8, #1, vgx4], {z4.d - z7.d}, z10.d[0] )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm value of 2.0f if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, double, @@ -605,7 +605,7 @@ TEST_P(InstSme, fmopa) { fmopa za2.s, p0/m, p2/m, z3.s, z4.s )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, fillNeon({10.0f}, (SVL / 8))); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, @@ -633,7 +633,7 @@ TEST_P(InstSme, fmopa) { fmopa za2.d, p0/m, p2/m, z3.d, z4.d )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, fillNeon({10.0}, (SVL / 8))); CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, @@ -850,7 +850,7 @@ TEST_P(InstSme, st1d) { st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] st1d {za1h.d[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 8)), src[i % 2]); @@ -880,7 +880,7 @@ TEST_P(InstSme, st1d) { ld1d 
{za1h.d[w13, 1]}, p1/z, [x0, x3, lsl #3] st1d {za1h.d[w13, 1]}, p1, [x5, x3, lsl #3] )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { + for (uint16_t i = 0; i < (SVL / 128); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 8)), src[i % 2]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); } @@ -911,7 +911,7 @@ TEST_P(InstSme, st1d) { st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] st1d {za1v.d[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 8)), src_vert[i % 2]); @@ -941,7 +941,7 @@ TEST_P(InstSme, st1d) { ld1d {za1v.d[w13, 1]}, p1/z, [x0, x3, lsl #3] st1d {za1v.d[w13, 1]}, p1, [x5, x3, lsl #3] )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { + for (uint16_t i = 0; i < (SVL / 128); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 8)), src_vert[i % 2]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); } @@ -974,7 +974,7 @@ TEST_P(InstSme, st1w) { st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] st1w {za1h.s[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 4)), src[i % 4]); @@ -1003,7 +1003,7 @@ TEST_P(InstSme, st1w) { ld1w {za1h.s[w12, 2]}, p1/z, [x0, x3, lsl #2] st1w {za1h.s[w12, 2]}, p1, [x5, x3, lsl #2] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 4)), src[i % 4]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); } @@ -1035,7 +1035,7 @@ TEST_P(InstSme, st1w) { st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] st1w {za1v.s[w12, 1]}, p0, [x4] )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - 4095 + (i * 4)), src_vert[i % 4]); @@ -1064,7 +1064,7 @@ TEST_P(InstSme, st1w) { ld1w {za1v.s[w12, 2]}, p1/z, [x0, x3, lsl #2] st1w {za1v.s[w12, 2]}, p1, [x5, x3, lsl #2] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { + for (uint16_t i = 0; i < (SVL / 64); i++) { EXPECT_EQ(getMemoryValue(800 + (i * 4)), src_vert[i % 4]); EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); } @@ -1109,7 +1109,7 @@ TEST_P(InstSme, udot_Indexed_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm values of {8, 9, 10, 11} if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, @@ -1173,7 +1173,7 @@ TEST_P(InstSme, udot_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, fillNeon({156, 316, 476, 636}, (SVL / 8))); @@ -1233,7 +1233,7 @@ TEST_P(InstSme, uvdot_indexed_vgx4) { )"); const uint16_t zaStride = (SVL / 8) / 4; const uint16_t zaQuartIndex = 2; - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { // Effected rows all use same zm values of {8, 9, 10, 11} if (i == zaQuartIndex) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, @@ -1279,7 +1279,7 @@ TEST_P(InstSme, umopa) { umopa za2.s, p0/m, p2/m, z3.b, z4.b )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, 
i, uint32_t, fillNeon({96}, (SVL / 8))); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, @@ -1302,7 +1302,7 @@ TEST_P(InstSme, zero) { zero {za} )"); - for (uint64_t i = 0; i < (SVL / 8); i++) { + for (uint16_t i = 0; i < (SVL / 8); i++) { CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint64_t, fillNeon({0}, SVL / 8)); } @@ -1339,7 +1339,7 @@ TEST_P(InstSme, zero) { zero {za0.s, za2.s} )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { + for (uint16_t i = 0; i < (SVL / 32); i++) { CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, fillNeon({0}, SVL / 8)); CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc index f9699593f3..9411ef0085 100644 --- a/test/regression/aarch64/instructions/sve.cc +++ b/test/regression/aarch64/instructions/sve.cc @@ -2884,7 +2884,7 @@ TEST_P(InstSve, faddv) { )"); float s3 = 0.0f; float s4 = 0.0f; - for (int i = 0; i < VL / 32; i++) { + for (uint64_t i = 0; i < VL / 32; i++) { s3 += fsrc[i % (fsrc.size())]; if (i < (VL / 64)) s4 += fsrc[i % (fsrc.size())]; } @@ -2922,7 +2922,7 @@ TEST_P(InstSve, faddv) { )"); double d3 = 0.0; double d4 = 0.0; - for (int i = 0; i < VL / 64; i++) { + for (uint64_t i = 0; i < (VL / 64); i++) { d3 += dsrc[i % (dsrc.size())]; if (i < (VL / 128)) d4 += dsrc[i % (dsrc.size())]; }
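
For reference, the faddv expectations at the end of this diff are built with an in-order predicated sum over the source elements; the same model in isolation (invented names and a plain bool mask), mirroring how the regression test computes its expected values rather than any particular hardware reduction order:

    #include <cstdio>
    #include <vector>

    // Predicated horizontal add: accumulate active lanes in increasing
    // element order, starting from zero.
    float faddvReference(const std::vector<float>& src, const std::vector<bool>& pred) {
      float sum = 0.0f;
      for (size_t i = 0; i < src.size(); i++) {
        if (pred[i]) sum += src[i];
      }
      return sum;
    }

    int main() {
      const std::vector<float> src = {1.0f, 2.5f, -0.5f, 4.0f};
      const std::vector<bool> full = {true, true, true, true};
      const std::vector<bool> lowerHalf = {true, true, false, false};
      std::printf("%f %f\n", faddvReference(src, full),       // 7.000000
                  faddvReference(src, lowerHalf));            // 3.500000
      return 0;
    }
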