
Commit

Adds ccmp logic into emitter backend.
anthonycanino committed Dec 20, 2024
1 parent 69c7a29 commit a625f59
Showing 11 changed files with 490 additions and 123 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/codegen.h
@@ -649,6 +649,7 @@ class CodeGen final : public CodeGenInterface
#if defined(TARGET_AMD64)
void genAmd64EmitterUnitTestsSse2();
void genAmd64EmitterUnitTestsApx();
void genAmd64EmitterUnitTestsCCMP();
#endif

#endif // defined(DEBUG)
4 changes: 4 additions & 0 deletions src/coreclr/jit/codegenlinear.cpp
@@ -2702,6 +2702,10 @@ void CodeGen::genEmitterUnitTests()
{
genAmd64EmitterUnitTestsApx();
}
if (unitTestSectionAll || (strstr(unitTestSection, "ccmp") != nullptr))
{
genAmd64EmitterUnitTestsCCMP();
}

#elif defined(TARGET_ARM64)
if (unitTestSectionAll || (strstr(unitTestSection, "general") != nullptr))
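The dispatch added above follows the existing pattern in genEmitterUnitTests(): each named section runs either when the catch-all flag is set or when its name appears as a substring of the configured section list. A minimal sketch of that selection pattern, assuming the section list arrives as a plain string such as "apx,ccmp" (the JIT config knob that supplies it is not shown in this diff):

// Minimal sketch of the section-selection pattern, under the assumptions above.
#include <cstdio>
#include <cstring>

static bool sectionRequested(const char* unitTestSection, const char* name)
{
    // "all" enables every section; otherwise a simple substring match is used,
    // mirroring the strstr() checks in genEmitterUnitTests().
    return (std::strcmp(unitTestSection, "all") == 0) || (std::strstr(unitTestSection, name) != nullptr);
}

int main()
{
    const char* cfg = "apx,ccmp";
    if (sectionRequested(cfg, "ccmp"))
    {
        std::printf("would run genAmd64EmitterUnitTestsCCMP()\n");
    }
    return 0;
}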
137 changes: 111 additions & 26 deletions src/coreclr/jit/codegenxarch.cpp
@@ -433,7 +433,8 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr size,
else
{
// For section constant, the immediate will be relocatable
GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm, INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags));
GetEmitter()->emitIns_R_I(INS_mov, size, reg, imm,
INS_OPTS_NONE DEBUGARG(targetHandle) DEBUGARG(gtFlags));
}
}
else
@@ -769,8 +770,8 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
{
GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
regNumber operandReg = genConsumeReg(operand);
instruction ins = genGetInsForOper(tree->OperGet(), targetType);
regNumber operandReg = genConsumeReg(operand);
instruction ins = genGetInsForOper(tree->OperGet(), targetType);

if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (targetReg != operandReg))
{
@@ -1202,7 +1203,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
// TODO-xarch-apx:
// APX can provide optimal code gen in this case using NDD feature:
// reg3 = op1 op op2 without extra mov

// see if it can be optimized by inc/dec
if (oper == GT_ADD && op2->isContainedIntOrIImmed() && !treeNode->gtOverflowEx())
{
Expand Down Expand Up @@ -1231,7 +1232,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
genProduceReg(treeNode);
return;
}
else
else
{
var_types op1Type = op1->TypeGet();
inst_Mov(op1Type, targetReg, op1reg, /* canSkip */ false);
@@ -1372,7 +1373,8 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
}
assert(regOp->isUsedFromReg());

if (JitConfig.JitEnableApxNDD() && emit->IsApxNDDEncodableInstruction(ins) && regOp->GetRegNum() != mulTargetReg)
if (JitConfig.JitEnableApxNDD() && emit->IsApxNDDEncodableInstruction(ins) &&
regOp->GetRegNum() != mulTargetReg)
{
// use NDD form to optimize this form:
// mov targetReg, regOp
@@ -1390,7 +1392,6 @@ void CodeGen::genCodeForMul(GenTreeOp* treeNode)
return;
}


// Setup targetReg when neither of the source operands was a matching register
inst_Mov(targetType, mulTargetReg, regOp->GetRegNum(), /* canSkip */ true);

@@ -4939,8 +4940,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
return;
}


if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (tree->GetRegNum() != operandReg))
if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) &&
(tree->GetRegNum() != operandReg))
{
ins = genMapShiftInsToShiftByConstantIns(ins, shiftByValue);
// If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov.
@@ -4951,7 +4952,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
}
else
{
GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue, INS_OPTS_EVEX_nd);
GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(tree), tree->GetRegNum(), operandReg, shiftByValue,
INS_OPTS_EVEX_nd);
}
genProduceReg(tree);
return;
@@ -5002,7 +5004,8 @@ void CodeGen::genCodeForShift(GenTree* tree)
// The operand to be shifted must not be in ECX
noway_assert(operandReg != REG_RCX);

if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) && (tree->GetRegNum() != operandReg))
if (JitConfig.JitEnableApxNDD() && GetEmitter()->IsApxNDDEncodableInstruction(ins) &&
(tree->GetRegNum() != operandReg))
{
// If APX is available, we can use NDD to optimize the case when LSRA failed to avoid explicit mov.
// this case might be rarely hit.
@@ -9367,14 +9370,14 @@ void CodeGen::genAmd64EmitterUnitTestsApx()

theEmitter->emitIns_R_R_R(INS_add, EA_8BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_sub, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_or, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_and, EA_2BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_R(INS_xor, EA_1BYTE, REG_R10, REG_EAX, REG_ECX, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R_I(INS_or, EA_2BYTE, REG_R10, REG_EAX, 10565, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_I(INS_or, EA_8BYTE, REG_R10, REG_EAX, 10, INS_OPTS_EVEX_nd);
theEmitter->emitIns_R_R_S(INS_or, EA_8BYTE, REG_R10, REG_EAX, 0, 1, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R(INS_neg, EA_2BYTE, REG_R10, REG_ECX, INS_OPTS_EVEX_nd);

theEmitter->emitIns_R_R(INS_shl, EA_2BYTE, REG_R11, REG_EAX, INS_OPTS_EVEX_nd);
@@ -9401,13 +9404,13 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_R_I(INS_add, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_I(INS_sub, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_I(INS_and, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_I(INS_or, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_I(INS_xor, EA_4BYTE, REG_R12, 5, INS_OPTS_EVEX_nf);

theEmitter->emitIns_R_S(INS_add, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_sub, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_and, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_or, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_xor, EA_4BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);

theEmitter->emitIns_R(INS_neg, EA_2BYTE, REG_R11, INS_OPTS_EVEX_nf);
@@ -9426,22 +9429,104 @@ void CodeGen::genAmd64EmitterUnitTestsApx()
theEmitter->emitIns_R(INS_div, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R(INS_idiv, EA_8BYTE, REG_R12, INS_OPTS_EVEX_nf);

theEmitter->emitIns_R_R(INS_tzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R(INS_lzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R(INS_tzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R(INS_lzcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R(INS_popcnt_evex, EA_8BYTE, REG_R12, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_tzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_lzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);

theEmitter->emitIns_R_S(INS_tzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_lzcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_popcnt_evex, EA_8BYTE, REG_R12, 0, 1, INS_OPTS_EVEX_nf);

theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11, (insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd));
theEmitter->emitIns_R_R_R(INS_add, EA_2BYTE, REG_R12, REG_R13, REG_R11,
(insOpts)(INS_OPTS_EVEX_nf | INS_OPTS_EVEX_nd));

theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R_R(INS_bextr, EA_8BYTE, REG_R11, REG_R13, REG_R11, INS_OPTS_EVEX_nf);

theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1);

theEmitter->emitIns_R_R(INS_blsi, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_R(INS_blsmsk, EA_8BYTE, REG_R11, REG_R13, INS_OPTS_EVEX_nf);
theEmitter->emitIns_R_S(INS_blsr, EA_8BYTE, REG_R11, 0, 1);
}

/*****************************************************************************
* Unit tests for the CCMP instructions.
*/

void CodeGen::genAmd64EmitterUnitTestsCCMP()
{
emitter* theEmitter = GetEmitter();
genDefineTempLabel(genCreateTempLabel());

// #ifdef COMMENTOUT

// ============
// Test RR form
// ============

// Test all sizes
theEmitter->emitIns_R_R(INS_ccmpe, EA_4BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_R(INS_ccmpe, EA_8BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_R(INS_ccmpe, EA_2BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_R(INS_ccmpe, EA_1BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);

// Test all CC codes
for (uint32_t ins = INS_FIRST_CCMP_INSTRUCTION + 1; ins < INS_LAST_CCMP_INSTRUCTION; ins++)
{
theEmitter->emitIns_R_R((instruction)ins, EA_4BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);
}

// Test all dfv
for (int i = 0; i < 16; i++)
{
theEmitter->emitIns_R_R(INS_ccmpe, EA_4BYTE, REG_RAX, REG_RCX, (insOpts)(i << INS_OPTS_EVEX_dfv_byte_offset));
}

// ============
// Test RS form
// ============

// Test all sizes
theEmitter->emitIns_R_S(INS_ccmpe, EA_4BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_S(INS_ccmpe, EA_8BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_S(INS_ccmpe, EA_2BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_S(INS_ccmpe, EA_1BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);

// Test all CC codes
for (uint32_t ins = INS_FIRST_CCMP_INSTRUCTION + 1; ins < INS_LAST_CCMP_INSTRUCTION; ins++)
{
theEmitter->emitIns_R_S((instruction)ins, EA_4BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);
}

// Test all dfv
for (int i = 0; i < 16; i++)
{
theEmitter->emitIns_R_S(INS_ccmpe, EA_4BYTE, REG_RAX, 0, 0, (insOpts)(i << INS_OPTS_EVEX_dfv_byte_offset));
}

// ============
// Test RI form (test small and large sizes and constants)
// ============

theEmitter->emitIns_R_I(INS_ccmpe, EA_4BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_I(INS_ccmpe, EA_4BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf);

theEmitter->emitIns_R_I(INS_ccmpe, EA_8BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_I(INS_ccmpe, EA_8BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf);

theEmitter->emitIns_R_I(INS_ccmpe, EA_2BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_I(INS_ccmpe, EA_2BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf);

theEmitter->emitIns_R_I(INS_ccmpe, EA_1BYTE, REG_RAX, 123, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_I(INS_ccmpe, EA_1BYTE, REG_RAX, 270, INS_OPTS_EVEX_dfv_cf);

// ============
// Test RC form
// ============

CORINFO_FIELD_HANDLE hnd = theEmitter->emitFltOrDblConst(1.0f, EA_4BYTE);
theEmitter->emitIns_R_C(INS_ccmpe, EA_4BYTE, REG_RAX, hnd, 0, INS_OPTS_EVEX_dfv_cf);
theEmitter->emitIns_R_C(INS_ccmpe, EA_4BYTE, REG_RAX, hnd, 4, INS_OPTS_EVEX_dfv_cf);
// #endif
}

#endif // defined(DEBUG) && defined(TARGET_AMD64)
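The new genAmd64EmitterUnitTestsCCMP() exercises the APX conditional-compare forms (ccmpe plus every condition code between INS_FIRST_CCMP_INSTRUCTION and INS_LAST_CCMP_INSTRUCTION) across register/register, register/stack, register/immediate, and register/constant operands, and across all sixteen 4-bit dfv (default flag value) payloads. As a behavioral reference only: under the APX definition as I read it, ccmp<cc> performs the compare and sets flags when <cc> holds on the current flags, and otherwise loads OF/SF/ZF/CF from the dfv immediate. The sketch below models that semantics; it is not the JIT's implementation, and the dfv bit order is an assumption.

// Behavioral model of ccmp<cc>, assuming the APX definition: when <cc> holds
// on the current flags, compare dst against src and set flags from the
// subtraction; otherwise load OF/SF/ZF/CF from the 4-bit dfv immediate.
// Illustrative sketch only; the dfv bit order (OF, SF, ZF, CF, high to low)
// is an assumption.
#include <cstdint>

struct Flags
{
    bool of, sf, zf, cf;
};

static Flags ccmpModel(bool ccHolds, int64_t dst, int64_t src, uint8_t dfv)
{
    Flags f{};
    if (ccHolds)
    {
        // Flags as produced by dst - src (overflow modeling omitted for brevity).
        int64_t res = dst - src;
        f.zf = (res == 0);
        f.sf = (res < 0);
        f.cf = (static_cast<uint64_t>(dst) < static_cast<uint64_t>(src));
        f.of = false;
    }
    else
    {
        f.of = ((dfv >> 3) & 1) != 0;
        f.sf = ((dfv >> 2) & 1) != 0;
        f.zf = ((dfv >> 1) & 1) != 0;
        f.cf = ((dfv >> 0) & 1) != 0;
    }
    return f;
}

int main()
{
    // Condition false: flags come straight from dfv (here ZF|CF = 0b0011).
    Flags f = ccmpModel(false, 1, 2, 0x3);
    return (f.zf && f.cf) ? 0 : 1;
}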
2 changes: 1 addition & 1 deletion src/coreclr/jit/compiler.cpp
@@ -2299,7 +2299,7 @@ void Compiler::compSetProcessor()
if (canUseApxEncoding())
{
// TODO-Xarch-apx:
// At this stage, since no machine will pass the CPUID check for APX, we need a special stress mode that
// At this stage, since no machine will pass the CPUID check for APX, we need a special stress mode that
// enables REX2 on incompatible platform, `DoJitStressRex2Encoding` is expected to be removed eventually.
codeGen->GetEmitter()->SetUseRex2Encoding(true);
codeGen->GetEmitter()->SetUsePromotedEVEXEncoding(true);
27 changes: 23 additions & 4 deletions src/coreclr/jit/emit.h
@@ -797,9 +797,13 @@ class emitter
unsigned _idCustom5 : 1;
unsigned _idCustom6 : 1;

#define _idEvexbContext (_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE */
#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */
#define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */
#define _idEvexbContext \
(_idCustom6 << 1) | _idCustom5 /* Evex.b: embedded broadcast, embedded rounding, embedded SAE \
*/
#define _idEvexNdContext _idCustom5 /* bits used for the APX-EVEX.nd context for promoted legacy instructions */
#define _idEvexNfContext _idCustom6 /* bits used for the APX-EVEX.nf context for promoted legacy/vex instructions */
#define _idEvexDFV (_idCustom4 << 3) | (_idCustom3 << 2) | (_idCustom2 << 1) | _idCustom1

#endif // TARGET_XARCH

#ifdef TARGET_ARM64
@@ -1015,6 +1019,7 @@ class emitter
regNumber _idReg3 : REGNUM_BITS;
regNumber _idReg4 : REGNUM_BITS;
};

#elif defined(TARGET_LOONGARCH64)
struct
{
@@ -1674,7 +1679,6 @@ class emitter

_idCustom5 = ((value >> 0) & 1);
_idCustom6 = ((value >> 1) & 1);

}

unsigned idGetEvexbContext() const
@@ -1736,6 +1740,21 @@ class emitter
assert(!idIsEvexNfContextSet());
_idEvexNfContext = 1;
}

unsigned idGetEvexDFV() const
{
return _idEvexDFV;
}

void idSetEvexDFV(insOpts instOptions)
{
unsigned value = static_cast<unsigned>((instOptions & INS_OPTS_EVEX_dfv_MASK) >> 8);

_idCustom1 = ((value >> 0) & 1);
_idCustom2 = ((value >> 1) & 1);
_idCustom3 = ((value >> 2) & 1);
_idCustom4 = ((value >> 3) & 1);
}
#endif

#ifdef TARGET_ARMARCH
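The new _idEvexDFV macro and the idSetEvexDFV()/idGetEvexDFV() accessors spread the 4-bit dfv payload across the descriptor's _idCustom1.._idCustom4 bits, mirroring how _idEvexbContext reuses _idCustom5/_idCustom6. A standalone sketch of that pack/unpack round trip, assuming INS_OPTS_EVEX_dfv_MASK covers bits 8-11 (only the ">> 8" shift is visible in this diff; the mask value and the option constant below are assumptions):

// Standalone sketch of the dfv pack/unpack round trip performed by
// idSetEvexDFV()/idGetEvexDFV(). The four one-bit fields stand in for
// _idCustom1.._idCustom4; the mask value 0x0F00 is an assumption.
#include <cassert>
#include <cstdint>

struct InsDescModel
{
    unsigned custom1 : 1;
    unsigned custom2 : 1;
    unsigned custom3 : 1;
    unsigned custom4 : 1;

    void setDFV(uint32_t instOptions)
    {
        unsigned value = (instOptions & 0x0F00u) >> 8; // INS_OPTS_EVEX_dfv_MASK assumed to be 0x0F00
        custom1 = (value >> 0) & 1;
        custom2 = (value >> 1) & 1;
        custom3 = (value >> 2) & 1;
        custom4 = (value >> 3) & 1;
    }

    unsigned getDFV() const
    {
        return (custom4 << 3) | (custom3 << 2) | (custom2 << 1) | custom1;
    }
};

int main()
{
    InsDescModel id{};
    id.setDFV(5u << 8); // dfv = 5 placed in the assumed option-bit position
    assert(id.getDFV() == 5);
    return 0;
}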
(Diffs for the remaining changed files did not load and are not shown.)
