Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support PUTARG_STK(STRUCT LCL_VAR/LCL_FLD) on ARM/64 #70256

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 52 additions & 120 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,9 +770,7 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)

GenTree* source = treeNode->gtGetOp1();

bool isStruct = source->TypeIs(TYP_STRUCT) || (source->OperGet() == GT_FIELD_LIST);

if (!isStruct) // a normal non-Struct argument
if (!source->TypeIs(TYP_STRUCT)) // a normal non-Struct argument
{
if (varTypeIsSIMD(source->TypeGet()))
{
Expand Down Expand Up @@ -864,9 +862,9 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
{
genPutArgStkFieldList(treeNode, varNumOut);
}
else // We must have a GT_OBJ or a GT_LCL_VAR
else
{
noway_assert(source->OperIs(GT_LCL_VAR, GT_OBJ));
noway_assert(source->OperIsLocalRead() || source->OperIs(GT_OBJ));

var_types targetType = source->TypeGet();
noway_assert(varTypeIsStruct(targetType));
Expand All @@ -879,96 +877,42 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
#ifdef TARGET_ARM64
regNumber hiReg = treeNode->GetSingleTempReg();
#endif // TARGET_ARM64
regNumber addrReg = REG_NA;

GenTreeLclVarCommon* varNode = nullptr;
GenTree* addrNode = nullptr;
GenTreeLclVarCommon* srcLclNode = nullptr;
regNumber addrReg = REG_NA;
ClassLayout* layout = nullptr;

if (source->OperGet() == GT_LCL_VAR)
// Set up "layout", "srcLclNode" and "addrReg".
if (source->OperIsLocalRead())
{
varNode = source->AsLclVarCommon();
srcLclNode = source->AsLclVarCommon();
layout = srcLclNode->GetLayout(compiler);
LclVarDsc* varDsc = compiler->lvaGetDesc(srcLclNode);

// This struct must live on the stack frame.
assert(varDsc->lvOnFrame && !varDsc->lvRegister);
}
else // we must have a GT_OBJ
{
assert(source->OperGet() == GT_OBJ);

addrNode = source->AsOp()->gtOp1;
layout = source->AsObj()->GetLayout();
addrReg = genConsumeReg(source->AsObj()->Addr());

// addrNode can either be a GT_LCL_VAR_ADDR or an address expression
//
if (addrNode->OperGet() == GT_LCL_VAR_ADDR)
#ifdef TARGET_ARM64
// If addrReg is equal to loReg, swap(loReg, hiReg).
// This reduces code complexity by only supporting one addrReg overwrite case.
if (loReg == addrReg)
{
// We have a GT_OBJ(GT_LCL_VAR_ADDR)
//
// We will treat this case the same as above
// (i.e if we just had this GT_LCL_VAR directly as the source)
// so update 'source' to point this GT_LCL_VAR_ADDR node
// and continue to the codegen for the LCL_VAR node below
//
assert(addrNode->isContained());
varNode = addrNode->AsLclVarCommon();
addrNode = nullptr;
loReg = hiReg;
hiReg = addrReg;
}
else // addrNode is used
{
// TODO-Cleanup: `Lowering::NewPutArg` marks only `LCL_VAR_ADDR` as contained nowadays,
// but we use `genConsumeAddress` as a precaution, use `genConsumeReg()` instead.
assert(!addrNode->isContained());
// Generate code to load the address that we need into a register
genConsumeAddress(addrNode);
addrReg = addrNode->GetRegNum();

#ifdef TARGET_ARM64
// If addrReg equal to loReg, swap(loReg, hiReg)
// This reduces code complexity by only supporting one addrReg overwrite case
if (loReg == addrReg)
{
loReg = hiReg;
hiReg = addrReg;
}
#endif // TARGET_ARM64
}
}

// Either varNode or addrNode must have been set up above;
// the xor ensures that exactly one of the two is set up, not both.
assert((varNode != nullptr) ^ (addrNode != nullptr));

ClassLayout* layout;
unsigned srcSize;
bool isHfa;

// Setup the srcSize, isHFa, and gcPtrCount
if (source->OperGet() == GT_LCL_VAR)
{
assert(varNode != nullptr);
LclVarDsc* varDsc = compiler->lvaGetDesc(varNode);

// This struct also must live in the stack frame
// And it can't live in a register (SIMD)
assert(varDsc->lvType == TYP_STRUCT);
assert(varDsc->lvOnFrame && !varDsc->lvRegister);

srcSize = varDsc->lvSize();
isHfa = varDsc->lvIsHfa();
layout = varDsc->GetLayout();
}
else // we must have a GT_OBJ
{
assert(source->OperGet() == GT_OBJ);

// If the source is an OBJ node then we need to use the type information
// it provides (size and GC layout) even if the node wraps a lclvar. Due
// to struct reinterpretation (e.g. Unsafe.As<X, Y>) it is possible that
// the OBJ node has a different type than the lclvar.
layout = source->AsObj()->GetLayout();
srcSize = layout->GetSize();
isHfa = compiler->IsHfa(layout->GetClassHandle());
}
unsigned srcSize = layout->GetSize();

// If we have an HFA we can't have any GC pointers,
// if not then the max size for the struct is 16 bytes
if (isHfa)
if (compiler->IsHfa(layout->GetClassHandle()))
{
noway_assert(!layout->HasGCPtr());
}
Expand All @@ -981,45 +925,32 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
noway_assert(srcSize <= MAX_PASS_MULTIREG_BYTES);
#endif // TARGET_ARM64

unsigned structSize;

unsigned dstSize = treeNode->GetStackByteSize();
if (dstSize != srcSize)

// We can generate smaller code if store size is a multiple of TARGET_POINTER_SIZE.
// The dst size can be rounded up to PUTARG_STK size. The src size can be rounded up
// if it reads a local variable because reading "too much" from a local cannot fault.
// We must also be careful to check for the arm64 apple case where arguments can be
// passed without padding.
//
if ((dstSize != srcSize) && (srcLclNode != nullptr))
{
// We can generate a smaller code if store size is a multiple of TARGET_POINTER_SIZE.
// The dst size can be rounded up to PUTARG_STK size.
// The src size can be rounded up if it reads a local variable slot because the local
// variable stack allocation size is rounded up to be a multiple of the TARGET_POINTER_SIZE.
// The exception is arm64 apple arguments because they can be passed without padding.
if (varNode != nullptr)
unsigned widenedSrcSize = roundUp(srcSize, TARGET_POINTER_SIZE);
if (widenedSrcSize <= dstSize)
{
// If we have a varNode, even if it was casted using `OBJ`, we can read its original memory size.
const LclVarDsc* varDsc = compiler->lvaGetDesc(varNode);
const unsigned varStackSize = varDsc->lvSize();
if (varStackSize >= srcSize)
{
srcSize = varStackSize;
}
srcSize = widenedSrcSize;
}
}
if (dstSize == srcSize)
{
structSize = dstSize;
}
else
{
// With Unsafe object cast we can have different strange combinations:
// PutArgStk<8>(Obj<16>(LclVar<8>)) -> copy 8 bytes;
// PutArgStk<16>(Obj<16>(LclVar<8>)) -> copy 16 bytes, reading undefined memory after the local.
structSize = min(dstSize, srcSize);
}

int remainingSize = structSize;
assert(srcSize <= dstSize);

int remainingSize = srcSize;
unsigned structOffset = 0;
unsigned lclOffset = (srcLclNode != nullptr) ? srcLclNode->GetLclOffs() : 0;
unsigned nextIndex = 0;

#ifdef TARGET_ARM64
// For a >= 16-byte structSize we will generate a ldp and stp instruction each loop
// For sizes >= 16 bytes we will generate a ldp and stp instruction each loop
// ldp x2, x3, [x0]
// stp x2, x3, [sp, #16]

Expand All @@ -1028,11 +959,11 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
var_types type0 = layout->GetGCPtrType(nextIndex + 0);
var_types type1 = layout->GetGCPtrType(nextIndex + 1);

if (varNode != nullptr)
if (srcLclNode != nullptr)
{
// Load from our varNumImp source
// Load from our local source
emit->emitIns_R_R_S_S(INS_ldp, emitTypeSize(type0), emitTypeSize(type1), loReg, hiReg,
varNode->GetLclNum(), structOffset);
srcLclNode->GetLclNum(), lclOffset + structOffset);
}
else
{
Expand All @@ -1056,17 +987,18 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
nextIndex += 2;
}
#else // TARGET_ARM
// For a >= 4 byte structSize we will generate a ldr and str instruction each loop
// For sizes >= 4 bytes we will generate a ldr and str instruction each loop
// ldr r2, [r0]
// str r2, [sp, #16]
while (remainingSize >= TARGET_POINTER_SIZE)
{
var_types type = layout->GetGCPtrType(nextIndex);

if (varNode != nullptr)
if (srcLclNode != nullptr)
{
// Load from our varNumImp source
emit->emitIns_R_S(INS_ldr, emitTypeSize(type), loReg, varNode->GetLclNum(), structOffset);
// Load from our local source
emit->emitIns_R_S(INS_ldr, emitTypeSize(type), loReg, srcLclNode->GetLclNum(),
lclOffset + structOffset);
}
else
{
Expand All @@ -1088,7 +1020,7 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
}
#endif // TARGET_ARM

// For a 12-byte structSize we will generate two load instructions
// For a 12-byte size we will generate two load instructions
// ldr x2, [x0]
// ldr w3, [x0, #8]
// str x2, [sp, #16]
Expand Down Expand Up @@ -1129,10 +1061,10 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode)
remainingSize -= moveSize;

instruction loadIns = ins_Load(type);
if (varNode != nullptr)
if (srcLclNode != nullptr)
{
// Load from our varNumImp source
emit->emitIns_R_S(loadIns, attr, loReg, varNode->GetLclNum(), structOffset);
// Load from our local source
emit->emitIns_R_S(loadIns, attr, loReg, srcLclNode->GetLclNum(), lclOffset + structOffset);
}
else
{
Expand Down
43 changes: 15 additions & 28 deletions src/coreclr/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1031,9 +1031,6 @@ bool Lowering::TryLowerSwitchToBitTest(
#endif // TARGET_XARCH
}

// NOTE: this method deliberately does not update the call arg table. It must only
// be used by NewPutArg and LowerArg; these functions are responsible for updating
// the call arg table as necessary.
void Lowering::ReplaceArgWithPutArgOrBitcast(GenTree** argSlot, GenTree* putArgOrBitcast)
{
assert(argSlot != nullptr);
Expand Down Expand Up @@ -1069,12 +1066,7 @@ void Lowering::ReplaceArgWithPutArgOrBitcast(GenTree** argSlot, GenTree* putArgO
// Notes:
// For System V systems with native struct passing (i.e. UNIX_AMD64_ABI defined)
// this method allocates a single GT_PUTARG_REG for 1 eightbyte structs and a GT_FIELD_LIST of two GT_PUTARG_REGs
// for two eightbyte structs.
//
// For STK passed structs the method generates GT_PUTARG_STK tree. For System V systems with native struct passing
// (i.e. UNIX_AMD64_ABI defined) this method also sets the GC pointers count and the pointers
// layout object, so the codegen of the GT_PUTARG_STK could use this for optimizing copying to the stack by value.
// (using block copy primitives for non GC pointers and a single TARGET_POINTER_SIZE copy with recording GC info.)
// for two eightbyte structs. For STK passed structs the method generates GT_PUTARG_STK tree.
//
GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, CallArg* callArg, var_types type)
{
Expand All @@ -1086,19 +1078,6 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, CallArg* callArg,

bool isOnStack = (callArg->AbiInfo.GetRegNum() == REG_STK);

#if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64)
// Mark contained when we pass struct
// GT_FIELD_LIST is always marked contained when it is generated
if (type == TYP_STRUCT)
{
arg->SetContained();
if ((arg->OperGet() == GT_OBJ) && (arg->AsObj()->Addr()->OperGet() == GT_LCL_VAR_ADDR))
{
MakeSrcContained(arg, arg->AsObj()->Addr());
}
}
#endif

#if FEATURE_ARG_SPLIT
// Struct can be split into register(s) and stack on ARM
if (compFeatureArgSplit() && callArg->AbiInfo.IsSplit())
Expand All @@ -1120,10 +1099,7 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, CallArg* callArg,
callArg->AbiInfo.GetStackByteSize(),
#endif
callArg->AbiInfo.NumRegs, call, putInIncomingArgArea);
// If struct argument is morphed to GT_FIELD_LIST node(s),
// we can know GC info by type of each GT_FIELD_LIST node.
// So we skip setting GC Pointer info.
//

GenTreePutArgSplit* argSplit = putArg->AsPutArgSplit();
for (unsigned regIndex = 0; regIndex < callArg->AbiInfo.NumRegs; regIndex++)
{
Expand All @@ -1132,6 +1108,12 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, CallArg* callArg,

if (arg->OperGet() == GT_OBJ)
{
arg->SetContained();
if (arg->AsObj()->Addr()->OperGet() == GT_LCL_VAR_ADDR)
{
MakeSrcContained(arg, arg->AsObj()->Addr());
}

ClassLayout* layout = arg->AsObj()->GetLayout();

// Set type of registers
Expand Down Expand Up @@ -1206,8 +1188,6 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, CallArg* callArg,
#ifdef DEBUG
// Make sure state is correct. The PUTARG_STK has TYP_VOID, as it doesn't produce
// a result. So the type of its operand must be the correct type to push on the stack.
// For a FIELD_LIST, this will be the type of the field (not the type of the arg),
// but otherwise it is generally the type of the operand.
callArg->CheckIsStruct();
#endif

Expand Down Expand Up @@ -1459,6 +1439,13 @@ void Lowering::LowerArg(GenTreeCall* call, CallArg* callArg, bool late)
ReplaceArgWithPutArgOrBitcast(ppArg, putArg);
}
}

arg = *ppArg;

if (arg->OperIs(GT_PUTARG_STK))
{
LowerPutArgStk(arg->AsPutArgStk());
}
}

#if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64)
Expand Down
Loading