[VPlan] Use pointer to member 0 as VPInterleaveRecipe's pointer arg. (#106431)

Update VPInterleaveRecipe to always use the pointer to member 0 as its
pointer argument. In many cases, this removes unneeded index adjustments
and simplifies VPInterleaveRecipe::execute.

In some rare cases, the address of member 0 does not dominate the insert
position of the interleave group. In those cases, a PtrAdd VPInstruction
is emitted to compute the address of member 0 from the address of the
insert position. Alternatively, we could hoist the recipe that computes
the address of member 0.
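
For illustration, a hypothetical load group where the address of member 0 only becomes available after the group's insert position (modeled on the example comment removed from VPInterleaveRecipe::execute below; names are illustrative, not from the patch):

    // Interleave group over A with factor 2. A load group is typically
    // inserted at its first member in program order -- here the load of
    // A[i+1] (member 1). The address &A[i] of member 0 is computed only
    // afterwards, so it does not dominate the insert position.
    int a = A[i + 1]; // member 1 (insert position)
    int b = A[i];     // member 0
    // With this patch, a PtrAdd is emitted at the insert position instead:
    //   member0.addr = ptradd(&A[i + 1], -sizeof(int))
    // and the interleave group uses it as its pointer argument.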
fhahn authored Oct 6, 2024
1 parent b0c070e commit 7f74651
Showing 18 changed files with 157 additions and 237 deletions.
10 changes: 8 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -220,9 +220,15 @@ class VPBuilder {
new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
}

-  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL,
+  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
                               const Twine &Name = "") {
-    return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
+    return tryInsertInstruction(new VPInstruction(
+        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
   }
+  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+                                const Twine &Name = "") {
+    return tryInsertInstruction(new VPInstruction(
+        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
+  }

VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
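A minimal usage sketch of the two helpers (hedged: Plan, InsertPos, InBounds, and Ctx stand in for the context available in VPlanTransforms::createInterleaveGroups below; the -8 byte offset is an arbitrary example value):

    // Materialize a negative byte offset as a VPlan live-in, then emit a
    // PtrAdd before the interleave recipe -- inbounds if the original IR
    // GEP carried the inbounds flag.
    VPBuilder B(InsertPos);
    VPValue *Off = Plan.getOrAddLiveIn(
        ConstantInt::get(Type::getInt32Ty(Ctx), -8, /*IsSigned=*/true));
    VPValue *Member0Addr =
        InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), Off)
                 : B.createPtrAdd(InsertPos->getAddr(), Off);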
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9070,8 +9070,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
-  VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
-                                          CM.isScalarEpilogueAllowed());
+  VPlanTransforms::createInterleaveGroups(
+      *Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());

for (ElementCount VF : Range)
Plan->addVF(VF);
7 changes: 6 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
@@ -957,7 +957,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};

-protected:
struct GEPFlagsTy {
char IsInBounds : 1;
GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
@@ -1308,6 +1307,12 @@ class VPInstruction : public VPRecipeWithIRFlags,
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}

+  VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags,
+                DebugLoc DL = {}, const Twine &Name = "")
+      : VPRecipeWithIRFlags(VPDef::VPInstructionSC,
+                            ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL),
+        Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}

VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");

37 changes: 12 additions & 25 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -641,7 +641,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
-    return Builder.CreatePtrAdd(Ptr, Addend, Name);
+    return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
+                        : Builder.CreatePtrAdd(Ptr, Addend, Name);
}
case VPInstruction::ResumePhi: {
Value *IncomingFromVPlanPred =
@@ -2605,51 +2606,37 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
unsigned InterleaveFactor = Group->getFactor();
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

-  // Prepare for the new pointers.
-  unsigned Index = Group->getIndex(Instr);

// TODO: extend the masked interleaved-group support to reversed access.
VPValue *BlockInMask = getMask();
assert((!BlockInMask || !Group->isReverse()) &&
"Reversed masked interleave-group not supported.");

-  Value *Idx;
+  Value *Index;
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform.
if (Group->isReverse()) {
Value *RuntimeVF =
getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
-    Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
-    Idx = State.Builder.CreateMul(Idx,
-                                  State.Builder.getInt32(Group->getFactor()));
-    Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
-    Idx = State.Builder.CreateNeg(Idx);
-  } else
-    Idx = State.Builder.getInt32(-Index);
+    Index = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
+    Index = State.Builder.CreateMul(Index,
+                                    State.Builder.getInt32(Group->getFactor()));
+    Index = State.Builder.CreateNeg(Index);
+  } else {
+    // TODO: Drop redundant 0-index GEP as follow-up.
+    Index = State.Builder.getInt32(0);
+  }

VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPLane(0));
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());

-  // Notice current instruction could be any index. Need to adjust the address
-  // to the member of index 0.
-  //
-  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
-  //       b = A[i];       // Member of index 0
-  // Current pointer is pointed to A[i+1], adjust it to A[i].
-  //
-  // E.g.  A[i+1] = a;     // Member of index 1
-  //       A[i]   = b;     // Member of index 0
-  //       A[i+2] = c;     // Member of index 2 (Current instruction)
-  // Current pointer is pointed to A[i+2], adjust it to A[i].

bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
InBounds = gep->isInBounds();
-  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
+  ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);

State.setDebugLocFrom(Instr->getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
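To make the new index arithmetic concrete, a standalone worked example with assumed values (fixed VF = 4, interleave factor = 2):

    #include <cstdio>

    int main() {
      int RuntimeVF = 4, Factor = 2; // assumed example values
      // Reversed group: step back from the last vector lane while keeping
      // the pointer operand uniform. The old code also added the member
      // index of the current instruction; with the member-0 pointer that
      // term is always zero and disappears.
      int ReverseIndex = -((RuntimeVF - 1) * Factor);
      // Forward group: the address already points at member 0, so the index
      // is 0 (the TODO notes this 0-index GEP can later be dropped).
      int ForwardIndex = 0;
      std::printf("reverse = %d, forward = %d\n", ReverseIndex, ForwardIndex);
      return 0; // prints: reverse = -6, forward = 0
    }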
52 changes: 46 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1590,14 +1590,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
}

void VPlanTransforms::createInterleaveGroups(
-    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+    VPlan &Plan,
+    const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+        &InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
if (InterleaveGroups.empty())
return;

+  // Interleave memory: for each Interleave Group we marked earlier as relevant
+  // for this VPlan, replace the Recipes widening its memory instructions with a
+  // single VPInterleaveRecipe at its insertion point.
+  VPDominatorTree VPDT;
+  VPDT.recalculate(Plan);
for (const auto *IG : InterleaveGroups) {
-    auto *Recipe =
-        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
@@ -1607,9 +1612,44 @@ void VPlanTransforms::createInterleaveGroups(

bool NeedsMaskForGaps =
IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
-    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
-                                        Recipe->getMask(), NeedsMaskForGaps);
-    VPIG->insertBefore(Recipe);

+    Instruction *IRInsertPos = IG->getInsertPos();
+    auto *InsertPos =
+        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
+
+    // Get or create the start address for the interleave group.
+    auto *Start =
+        cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
+    VPValue *Addr = Start->getAddr();
+    VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
+    if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
+      // TODO: Hoist Addr's defining recipe (and any operands as needed) to
+      // InsertPos or sink loads above zero members to join it.
+      bool InBounds = false;
+      if (auto *Gep = dyn_cast<GetElementPtrInst>(
+              getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
+        InBounds = Gep->isInBounds();
+
+      // We cannot re-use the address of member zero because it does not
+      // dominate the insert position. Instead, use the address of the insert
+      // position and create a PtrAdd adjusting it to the address of member
+      // zero.
+      assert(IG->getIndex(IRInsertPos) != 0 &&
+             "index of insert position shouldn't be zero");
+      APInt Offset(32,
+                   getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
+                       IG->getIndex(IRInsertPos),
+                   /*IsSigned=*/true);
+      VPValue *OffsetVPV = Plan.getOrAddLiveIn(
+          ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
+      VPBuilder B(InsertPos);
+      Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
+                      : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
+    }
+    auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
+                                        InsertPos->getMask(), NeedsMaskForGaps);
+    VPIG->insertBefore(InsertPos);

unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
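The byte-offset arithmetic of the new PtrAdd path as a standalone sketch (assumed: i32 group members and an insert position at member index 2):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Stand-ins for getLoadStoreType(IRInsertPos)->getScalarSizeInBits()
      // and IG->getIndex(IRInsertPos); values are assumed for illustration.
      uint64_t ScalarSizeInBits = 32;
      uint64_t IndexOfInsertPos = 2;
      int64_t OffsetBytes =
          static_cast<int64_t>(ScalarSizeInBits / 8 * IndexOfInsertPos); // 8
      // The live-in constant is negated, so the emitted recipe is
      // conceptually
      //   Addr = ptradd(InsertPos->getAddr(), -8)
      // stepping from the insert position's address back to member 0.
      std::printf("live-in offset = %lld bytes\n",
                  static_cast<long long>(-OffsetBytes));
      return 0;
    }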
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -114,7 +114,9 @@ struct VPlanTransforms {
// widening its memory instructions with a single VPInterleaveRecipe at its
// insertion point.
static void createInterleaveGroups(
-      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
+      VPlan &Plan,
+      const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
+          &InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);

/// Remove dead recipes from \p Plan.
@@ -14,8 +14,7 @@ define void @interleaved_store_first_order_recurrence(ptr noalias %src, ptr %dst
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 2
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -2
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -41,13 +41,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -121,20 +119,19 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[TMP7]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
@@ -404,10 +401,10 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
-; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
@@ -715,16 +712,14 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
@@ -1271,14 +1266,11 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
+; CHECK-NEXT: [[P:%.+]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
