encode,encode2: Support AVX-512 encoding

aengelke · Jul 7, 2024 · 0d67083 · 0d67083
1 parent a16367e
commit 0d67083
Show file tree

Hide file tree

Showing 11 changed files with 901 additions and 91 deletions.
diff --git a/README.md b/README.md
@@ -90,25 +90,26 @@ failed |= fe_enc64(&cur, FE_RET);
 The API consists of one function to handle encode requests, as well as some macros. More information can be found in [fadec-enc.h](fadec-enc.h). Usage of internals like enum values is not recommended.
 
 - `int fe_enc64(uint8_t** buf, uint64_t mnem, int64_t operands...)`
-    - Encodes an instruction for x86-64 into `*buf`.
+    - Encodes an instruction for x86-64 into `*buf`. EVEX-encoded instructions will transparently encode with the shorter VEX prefix where permitted.
     - Return value: `0` on success, a negative value in error cases.
     - `buf`: Pointer to the pointer to the instruction buffer. The pointer (`*buf`) will be advanced by the number of bytes written. The instruction buffer must have at least 15 bytes left.
     - `mnem`: Instruction mnemonic to encode combined with extra flags:
         - `FE_SEG(segreg)`: override segment to specified segment register.
         - `FE_ADDR32`: override address size to 32-bit.
         - `FE_JMPL`: use longest possible offset encoding, useful when jump target is not known.
+        - `FE_MASK(maskreg)`: specify non-zero mask register (1--7) for instructions that support masking (suffixed with `_mask` or `_maskz`) or require a mask (AVX-512 gather/scatter).
+        - `FE_RC_RN/RD/RU/RZ`: set rounding mode for instructions with static rounding control (suffixed `_er`).
     - `operands...`: Up to 4 instruction operands. The operand kinds must match the requirements of the mnemonic.
-        - For register operands, use the register: `FE_AX`, `FE_AH`, `FE_XMM12`.
-        - For immediate operands, use the constant: `12`, `-0xbeef`.
-        - For memory operands, use: `FE_MEM(basereg,scale,indexreg,offset)`. Use `0` to specify _no register_. For RIP-relative addressing, the size of the instruction is added automatically.
-        - For offset operands, specify the target address.
+        - For register operands (`r`=non-mask register, `k`=mask register), use the register: `FE_AX`, `FE_AH`, `FE_XMM12`.
+        - For immediate operands (`i`=regular, `a`=absolute address), use the constant: `12`, `-0xbeef`.
+        - For memory operands (`m`=regular or `b`=broadcast), use: `FE_MEM(basereg,scale,indexreg,offset)`. Use `0` to specify _no register_. For RIP-relative addressing, the size of the instruction is added automatically.
+        - For offset operands (`o`), specify the target address.
 
 ## Known issues
 - Decoder/Encoder: register uniqueness constraints are not enforced. This affects:
     - VSIB-encoded instructions: no vector register may be used more than once
     - AMX instructions: no tile register may be used more than once
     - AVX-512 complex FP16 multiplication: destination must not be equal to a source register
-- Encoder: AVX-512 not supported (yet).
 - Prefixes for indirect jumps and calls are not properly decoded, e.g. `notrack`, `bnd`.
 - Low test coverage. (Help needed.)
 - No Python API.

diff --git a/encode-test.c b/encode-test.c
@@ -54,6 +54,7 @@ main(int argc, char** argv)
     // VSIB encoding doesn't differ for this API
 #define FE_MEMV FE_MEM
 #define FE_PTR(off) ((intptr_t) buf + (off))
+#define FLAGMASK(flags, mask) (flags | FE_MASK(mask & 7))
 #include "encode-test.inc"
 
     puts(failed ? "Some tests FAILED" : "All tests PASSED");

diff --git a/encode-test.inc b/encode-test.inc
diff --git a/encode.c b/encode.c
@@ -30,24 +30,32 @@
 #define OPC_67 FE_ADDR32
 #define OPC_SEG_MSK 0xe0000000
 #define OPC_JMPL FE_JMPL
-#define OPC_MASK_MSK 0x1e00000000
+#define OPC_MASK_MSK 0xe00000000
+#define OPC_EVEXZ 0x1000000000
 #define OPC_USER_MSK (OPC_67|OPC_SEG_MSK|OPC_MASK_MSK)
 #define OPC_FORCE_SIB 0x2000000000
+#define OPC_DOWNGRADE_VEX 0x4000000000
+#define OPC_DOWNGRADE_VEX_FLIPW 0x40000000000
+#define OPC_EVEX_DISP8SCALE 0x38000000000
 #define OPC_GPH_OP0 0x200000000000
 #define OPC_GPH_OP1 0x400000000000
 
-#define EPFX_REX_MSK 0x0f
-#define EPFX_REX 0x08
-#define EPFX_REXR 0x04
-#define EPFX_REXX 0x02
-#define EPFX_REXB 0x01
-#define EPFX_VVVV_IDX 4
+#define EPFX_REX_MSK 0x43f
+#define EPFX_REX 0x20
+#define EPFX_EVEX 0x40
+#define EPFX_REXR 0x10
+#define EPFX_REXX 0x08
+#define EPFX_REXB 0x04
+#define EPFX_REXR4 0x02
+#define EPFX_REXB4 0x01
+#define EPFX_REXX4 0x400
+#define EPFX_VVVV_IDX 11
 
 static bool op_mem(FeOp op) { return op < 0; }
 static bool op_reg(FeOp op) { return op >= 0; }
-static bool op_reg_gpl(FeOp op) { return (op & ~0xf) == 0x100; }
+static bool op_reg_gpl(FeOp op) { return (op & ~0x1f) == 0x100; }
 static bool op_reg_gph(FeOp op) { return (op & ~0x3) == 0x204; }
-static bool op_reg_xmm(FeOp op) { return (op & ~0xf) == 0x600; }
+static bool op_reg_xmm(FeOp op) { return (op & ~0x1f) == 0x600; }
 static int64_t op_mem_offset(FeOp op) { return (int32_t) op; }
 static unsigned op_mem_base(FeOp op) { return (op >> 32) & 0xfff; }
 static unsigned op_mem_idx(FeOp op) { return (op >> 44) & 0xfff; }
@@ -97,13 +105,29 @@ enc_opc(uint8_t** restrict buf, uint64_t opc, uint64_t epfx)
         *(*buf)++ = (0x65643e362e2600 >> (8 * ((opc >> 29) & 7))) & 0xff;
     if (opc & OPC_67) *(*buf)++ = 0x67;
     if (opc & OPC_EVEXL0) {
-        return -1;
+        *(*buf)++ = 0x62;
+        unsigned b1 = opc >> 16 & 7;
+        if (!(epfx & EPFX_REXR)) b1 |= 0x80;
+        if (!(epfx & EPFX_REXX)) b1 |= 0x40;
+        if (!(epfx & EPFX_REXB)) b1 |= 0x20;
+        if (!(epfx & EPFX_REXR4)) b1 |= 0x10;
+        if ((epfx & EPFX_REXB4)) b1 |= 0x08;
+        *(*buf)++ = b1;
+        unsigned b2 = opc >> 20 & 3;
+        if (!(epfx & EPFX_REXX4)) b2 |= 0x04;
+        b2 |= (~(epfx >> EPFX_VVVV_IDX) & 0xf) << 3;
+        if (opc & OPC_REXW) b2 |= 0x80;
+        *(*buf)++ = b2;
+        unsigned b3 = opc >> 33 & 7;
+        b3 |= (~(epfx >> EPFX_VVVV_IDX) & 0x10) >> 1;
+        if (opc & OPC_EVEXB) b3 |= 0x10;
+        b3 |= (opc >> 23 & 3) << 5;
+        if (opc & OPC_EVEXZ) b3 |= 0x80;
+        *(*buf)++ = b3;
     } else if (opc & OPC_VEXL0) {
+        if (epfx & (EPFX_REXR4|EPFX_REXX4|EPFX_REXB4|(0x10<<EPFX_VVVV_IDX))) return -1;
         bool vex3 = opc & (OPC_REXW|0x20000) || epfx & (EPFX_REXX|EPFX_REXB);
-        unsigned pp = 0;
-        if (opc & OPC_66) pp = 1;
-        if (opc & OPC_F3) pp = 2;
-        if (opc & OPC_F2) pp = 3;
+        unsigned pp = opc >> 20 & 3;
         *(*buf)++ = 0xc4 | !vex3;
         unsigned b2 = pp | (opc & 0x800000 ? 0x4 : 0);
         if (vex3) {
@@ -171,13 +195,28 @@ enc_mr(uint8_t** restrict buf, uint64_t opc, uint64_t epfx, uint64_t op0,
 {
     // If !op_reg(op1), it is a constant value for ModRM.reg
     if (op_reg(op0) && (op_reg_idx(op0) & 0x8)) epfx |= EPFX_REXB;
+    if (op_reg(op0) && (op_reg_idx(op0) & 0x10))
+        epfx |= 0 ? EPFX_REXB4 : EPFX_REXX|EPFX_EVEX;
     if (op_mem(op0) && (op_mem_base(op0) & 0x8)) epfx |= EPFX_REXB;
+    if (op_mem(op0) && (op_mem_base(op0) & 0x10)) epfx |= EPFX_REXB4;
     if (op_mem(op0) && (op_mem_idx(op0) & 0x8)) epfx |= EPFX_REXX;
-    if (op_reg(op1) && op_reg_idx(op1) & 0x8) epfx |= EPFX_REXR;
+    if (op_mem(op0) && (op_mem_idx(op0) & 0x10))
+        epfx |= opc & OPC_VSIB ? 0x10<<EPFX_VVVV_IDX : EPFX_REXX4;
+    if (op_reg(op1) && (op_reg_idx(op1) & 0x8)) epfx |= EPFX_REXR;
+    if (op_reg(op1) && (op_reg_idx(op1) & 0x10)) epfx |= EPFX_REXR4;
 
-    bool has_rex = opc & OPC_REXW || epfx & EPFX_REX_MSK;
+    bool has_rex = opc & (OPC_REXW|OPC_VEXL0|OPC_EVEXL0) || (epfx & EPFX_REX_MSK);
     if (has_rex && (op_reg_gph(op0) || op_reg_gph(op1))) return -1;
 
+    if (epfx & (EPFX_EVEX|EPFX_REXB4|EPFX_REXX4|EPFX_REXR4|(0x10<<EPFX_VVVV_IDX))) {
+        if (!(opc & OPC_EVEXL0)) return -1;
+    } else if (opc & OPC_DOWNGRADE_VEX) { // downgrade EVEX to VEX
+        // clear EVEX and disp8scale, set VEX
+        opc = (opc & ~(uint64_t) (OPC_EVEXL0|OPC_EVEX_DISP8SCALE)) | OPC_VEXL0;
+        if (opc & OPC_DOWNGRADE_VEX_FLIPW)
+            opc ^= OPC_REXW;
+    }
+
     if (LIKELY(op_reg(op0))) {
         if (enc_opc(buf, opc, epfx)) return -1;
         *(*buf)++ = 0xc0 | ((op_reg_idx(op1) & 7) << 3) | (op_reg_idx(op0) & 7);
@@ -198,6 +237,8 @@ enc_mr(uint8_t** restrict buf, uint64_t opc, uint64_t epfx, uint64_t op0,
         if (opc & OPC_VSIB)
         {
             if (!op_reg_xmm(op_mem_idx(op0))) return -1;
+            // EVEX VSIB requires non-zero opmask
+            if ((opc & OPC_EVEXL0) && !(opc & OPC_MASK_MSK)) return -1;
         }
         else
         {
@@ -235,8 +276,15 @@ enc_mr(uint8_t** restrict buf, uint64_t opc, uint64_t epfx, uint64_t op0,
             rm = 4;
         }
         if (off) {
-            mod = op_imm_n(off, 1) ? 0x40 : 0x80;
-            dispsz = op_imm_n(off, 1) ? 1 : 4;
+            unsigned disp8scale = (opc & OPC_EVEX_DISP8SCALE) >> 39;
+            if (!(off & ((1 << disp8scale) - 1)) && op_imm_n(off >> disp8scale, 1)) {
+                mod = 0x40;
+                dispsz = 1;
+                off >>= disp8scale;
+            } else {
+                mod = 0x80;
+                dispsz = 4;
+            }
         } else if (rm == 5) {
             mod = 0x40;
             dispsz = 1;
@@ -370,7 +418,8 @@ try_encode:;
         FeOp modreg = ei->modreg ? ops[ei->modreg^3] : (opc & 0xff00) >> 8;
         if (ei->vexreg)
             epfx |= ((uint64_t) op_reg_idx(ops[ei->vexreg^3])) << EPFX_VVVV_IDX;
-        if (enc_mr(buf, opc, epfx, ops[ei->modrm^3], modreg, immsz)) goto fail;
+        // Can fail for upgrade to EVEX due to high register numbers
+        if (enc_mr(buf, opc, epfx, ops[ei->modrm^3], modreg, immsz)) goto next;
     } else if (ei->modreg) {
         if (enc_o(buf, opc, epfx, ops[ei->modreg^3])) goto fail;
     } else {

diff --git a/encode2-test.c b/encode2-test.c
@@ -42,6 +42,7 @@ main(void) {
 #define ENC_TEST_TYPESAFE
     // Silence -Warray-bounds with double cast
 #define FE_PTR(off) (const void*) ((uintptr_t) buf + (off))
+#define FLAGMASK(flags, mask) flags, mask
 #include "encode-test.inc"
 
     TEST("\x90", NOP, 0);

diff --git a/encode2-test.cc b/encode2-test.cc
@@ -43,6 +43,7 @@ int main() {
 #define ENC_TEST_TYPESAFE
     // Silence -Warray-bounds with double cast
 #define FE_PTR(off) (const void*) ((uintptr_t) buf.data() + (off))
+#define FLAGMASK(flags, mask) flags, mask
 #include "encode-test.inc"
 
     std::puts(failed ? "Some tests FAILED" : "All tests PASSED");