From 9d7ec7639517183382b07b8e24e1620a902503ce Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 7 Nov 2024 16:50:31 +0000 Subject: [PATCH] Implemented ADD (uint32, vgx2, vectors and ZA), SME instruction with tests. --- src/lib/arch/aarch64/Instruction_execute.cc | 34 ++++++++++++++++ test/regression/aarch64/instructions/sme.cc | 45 +++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 4300fc68c..28a254679 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -328,6 +328,40 @@ void Instruction::execute() { results_[0] = vecAdd_3ops(sourceValues_); break; } + case Opcode::AArch64_ADD_VG2_M2Z_S: { // add za.s[wv, off, vgx2], {zn1.s, + // zn2.s} + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint16_t elemCount = VL_bits / 32; + + // Get ZA stride between halves and index into each ZA half + const uint16_t zaStride = zaRowCount / 2; + const uint32_t zaIndex = (sourceValues_[zaRowCount].get<uint32_t>() + + metadata_.operands[0].sme.slice_offset.imm) % + zaStride; + + // Pre-set all ZA result rows as only 2 will be updated in loop below + for (int z = 0; z < zaRowCount; z++) { + results_[z] = sourceValues_[z]; + } + + for (int r = 0; r < 2; r++) { + const uint32_t* zaRow = + sourceValues_[(r * zaStride) + zaIndex].getAsVector<uint32_t>(); + const uint32_t* znr = + sourceValues_[zaRowCount + 1 + r].getAsVector<uint32_t>(); + uint32_t out[64] = {0}; + for (int i = 0; i < elemCount; i++) { + out[i] = zaRow[i] + znr[i]; + } + results_[(r * zaStride) + zaIndex] = RegisterValue(out, 256); + } + break; + } case Opcode::AArch64_ADR: { // adr xd, #imm results_[0] = instructionAddress_ + metadata_.operands[1].imm; break; diff --git a/test/regression/aarch64/instructions/sme.cc 
b/test/regression/aarch64/instructions/sme.cc index 8b1272547..aca3d0ba9 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -7,6 +7,51 @@ namespace { using InstSme = AArch64RegressionTest; +TEST_P(InstSme, add) { + // uint32_t, vgx2, vecs with ZA + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + # Pre-fill all of za with 96 (uint32_t) + dup z0.b, #8 + dup z1.b, #3 + ptrue p0.b + ptrue p1.b + umopa za0.s, p0/m, p1/m, z0.b, z1.b + umopa za1.s, p0/m, p1/m, z0.b, z1.b + umopa za2.s, p0/m, p1/m, z0.b, z1.b + umopa za3.s, p0/m, p1/m, z0.b, z1.b + + # Set 2 of the za rows + mov w8, #1 + dup z0.s, #8 + dup z1.s, #3 + add za.s[w8, #1, vgx2], {z0.s, z1.s} + )"); + const uint16_t zaStride = (SVL / 8) / 2; + const uint16_t zaHalfIndex = 2; + for (uint64_t i = 0; i < (SVL / 8); i++) { + if (i == zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon<uint32_t>({104}, (SVL / 8))); + } else if (i == zaStride + zaHalfIndex) { + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon<uint32_t>({99}, (SVL / 8))); + } else { + // unaffected rows should still be 96 throughout + CHECK_MAT_ROW(AARCH64_REG_ZA, i, uint32_t, + fillNeon<uint32_t>({96}, (SVL / 8))); + } + } +} + TEST_P(InstSme, mova_tileToVec) { // 8-bit RUN_AARCH64(R"(