diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 13504508e2fb2e..da065e8d8cb6b8 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " "target occupancy = " << TgtOcc << '\n'); - GCNMaxOccupancySchedStrategy LStrgy(Context); + GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true); unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7c633b2bce7bc2..d46c4cf23a221e 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -296,6 +296,63 @@ collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs, } } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +static LaneBitmask getLanesWithProperty( + const LiveIntervals &LIS, const MachineRegisterInfo &MRI, + bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, + LaneBitmask SafeDefault, + function_ref<bool(const LiveRange &LR, SlotIndex Pos)> Property) { + if (RegUnit.isVirtual()) { + const LiveInterval &LI = LIS.getInterval(RegUnit); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; + } + } else if (Property(LI, Pos)) { + Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) + : LaneBitmask::getAll(); + } + + return Result; + } + + const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); + if (LR == nullptr) + return SafeDefault; + return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone(); +} + +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +/// Helper to find a vreg use between two indices {PriorUseIdx, NextUseIdx}. +/// The query starts with a lane bitmask which gets lanes/bits removed for every +/// use we find. +static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, + SlotIndex PriorUseIdx, SlotIndex NextUseIdx, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI, + const LiveIntervals *LIS, + bool Upward = false) { + for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + if (MO.isUndef()) + continue; + const MachineInstr *MI = MO.getParent(); + SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); + bool InRange = Upward ? 
(InstSlot > PriorUseIdx && InstSlot <= NextUseIdx) + : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx); + if (!InRange) + continue; + + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); + LastUseMask &= ~UseMask; + if (LastUseMask.none()) + return LaneBitmask::getNone(); + } + return LastUseMask; +} + /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -354,17 +411,28 @@ void GCNRPTracker::reset(const MachineInstr &MI, MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); } -//////////////////////////////////////////////////////////////////////////////// -// GCNUpwardRPTracker - -void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_, - const LiveRegSet &LiveRegs_) { +void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, + const LiveRegSet &LiveRegs_) { MRI = &MRI_; LiveRegs = LiveRegs_; LastTrackedMI = nullptr; MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_); } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, + SlotIndex Pos) const { + return getLanesWithProperty( + LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + [](const LiveRange &LR, SlotIndex Pos) { + const LiveRange::Segment *S = LR.getSegmentContaining(Pos); + return S != nullptr && S->end == Pos.getRegSlot(); + }); +} + +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); @@ -441,25 +509,37 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI, return true; } -bool GCNDownwardRPTracker::advanceBeforeNext() { +bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, + bool UseInternalIterator) { assert(MRI && "call reset first"); - if (!LastTrackedMI) - return NextMI == MBBEnd; - - assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); + SlotIndex SI; + const MachineInstr *CurrMI; + if (UseInternalIterator) { + if (!LastTrackedMI) + return NextMI == MBBEnd; + + assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); + CurrMI = LastTrackedMI; + + SI = NextMI == MBBEnd + ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot() + : LIS.getInstructionIndex(*NextMI).getBaseIndex(); + } else { //! UseInternalIterator + SI = LIS.getInstructionIndex(*MI).getBaseIndex(); + CurrMI = MI; + } - SlotIndex SI = NextMI == MBBEnd - ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot() - : LIS.getInstructionIndex(*NextMI).getBaseIndex(); assert(SI.isValid()); // Remove dead registers or mask bits. 
SmallSet SeenRegs; - for (auto &MO : LastTrackedMI->operands()) { + for (auto &MO : CurrMI->operands()) { if (!MO.isReg() || !MO.getReg().isVirtual()) continue; if (MO.isUse() && !MO.readsReg()) continue; + if (!UseInternalIterator && MO.isDef()) + continue; if (!SeenRegs.insert(MO.getReg()).second) continue; const LiveInterval &LI = LIS.getInterval(MO.getReg()); @@ -492,15 +572,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() { LastTrackedMI = nullptr; - return NextMI == MBBEnd; + return UseInternalIterator && (NextMI == MBBEnd); } -void GCNDownwardRPTracker::advanceToNext() { - LastTrackedMI = &*NextMI++; - NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); +void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI, + bool UseInternalIterator) { + if (UseInternalIterator) { + LastTrackedMI = &*NextMI++; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + } else { + LastTrackedMI = MI; + } + + const MachineInstr *CurrMI = LastTrackedMI; // Add new registers or mask bits. - for (const auto &MO : LastTrackedMI->all_defs()) { + for (const auto &MO : CurrMI->all_defs()) { Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; @@ -513,11 +600,16 @@ void GCNDownwardRPTracker::advanceToNext() { MaxPressure = max(MaxPressure, CurPressure); } -bool GCNDownwardRPTracker::advance() { - if (NextMI == MBBEnd) +bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) { + if (UseInternalIterator && NextMI == MBBEnd) return false; - advanceBeforeNext(); - advanceToNext(); + + advanceBeforeNext(MI, UseInternalIterator); + advanceToNext(MI, UseInternalIterator); + if (!UseInternalIterator) { + // We must remove any dead def lanes from the current RP + advanceBeforeNext(MI, true); + } return true; } @@ -559,6 +651,67 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, }); } +GCNRegPressure +GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const { + assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); + + SlotIndex SlotIdx; + SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); + + // Account for register pressure similar to RegPressureTracker::recede(). + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false); + RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); + GCNRegPressure TempPressure = CurPressure; + + for (const RegisterMaskPair &Use : RegOpers.Uses) { + Register Reg = Use.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); + if (LastUseMask.none()) + continue; + // The LastUseMask is queried from the liveness information of instruction + // which may be further down the schedule. Some lanes may actually not be + // last uses for the current position. + // FIXME: allow the caller to pass in the list of vreg uses that remain + // to be bottom-scheduled to avoid searching uses at each query. + SlotIndex CurrIdx; + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward( + LastTrackedMI ? LastTrackedMI : MBB->begin(), MBB->end()); + if (IdxPos == MBB->end()) { + CurrIdx = LIS.getMBBEndIdx(MBB); + } else { + CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot(); + } + + LastUseMask = + findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS); + if (LastUseMask.none()) + continue; + + LaneBitmask LiveMask = + LiveRegs.contains(Reg) ? 
LiveRegs.at(Reg) : LaneBitmask(0); + LaneBitmask NewMask = LiveMask & ~LastUseMask; + TempPressure.inc(Reg, LiveMask, NewMask, *MRI); + } + + // Generate liveness for defs. + for (const RegisterMaskPair &Def : RegOpers.Defs) { + Register Reg = Def.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LiveMask = + LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0); + LaneBitmask NewMask = LiveMask | Def.LaneMask; + TempPressure.inc(Reg, LiveMask, NewMask, *MRI); + } + + return TempPressure; +} + bool GCNUpwardRPTracker::isValid() const { const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index d419fcc802c60a..06c3d9027db1b5 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/RegisterPressure.h" #include <algorithm> namespace llvm { @@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1, return Diff; } +/////////////////////////////////////////////////////////////////////////////// +// GCNRPTracker + class GCNRPTracker { public: using LiveRegSet = DenseMap<unsigned, LaneBitmask>; @@ -165,7 +169,14 @@ class GCNRPTracker { void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, bool After); + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp + void bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs); + + LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + public: + // reset tracker and set live register set to the specified value. + void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } @@ -182,34 +193,38 @@ class GCNRPTracker { GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + class GCNUpwardRPTracker : public GCNRPTracker { public: GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} - // reset tracker and set live register set to the specified value. - void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); + using GCNRPTracker::reset; - // reset tracker at the specified slot index. + /// reset tracker at the specified slot index \p SI. void reset(const MachineRegisterInfo &MRI, SlotIndex SI) { - reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); + GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); } - // reset tracker to the end of the MBB. + /// reset tracker to the end of the \p MBB. void reset(const MachineBasicBlock &MBB) { reset(MBB.getParent()->getRegInfo(), LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); } - // reset tracker to the point just after MI (in program order). + /// reset tracker to the point just after \p MI (in program order). void reset(const MachineInstr &MI) { reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot()); } - // move to the state just before the MI (in program order). + /// Move to the state of RP just before \p MI (in program order). 
void recede(const MachineInstr &MI); - // checks whether the tracker's state after receding MI corresponds - // to reported by LIS. + /// \returns whether the tracker's state after receding \p MI corresponds + /// to the one reported by LIS. bool isValid() const; const GCNRegPressure &getMaxPressure() const { return MaxPressure; } @@ -223,6 +238,9 @@ class GCNUpwardRPTracker : public GCNRPTracker { } }; +//////////////////////////////////////////////////////////////////////////////// +// GCNDownwardRPTracker + class GCNDownwardRPTracker : public GCNRPTracker { // Last position of reset or advanceBeforeNext MachineBasicBlock::const_iterator NextMI; @@ -232,37 +250,65 @@ class GCNDownwardRPTracker : public GCNRPTracker { public: GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + using GCNRPTracker::reset; + MachineBasicBlock::const_iterator getNext() const { return NextMI; } - // Return MaxPressure and clear it. + /// \returns MaxPressure and clears it. GCNRegPressure moveMaxPressure() { auto Res = MaxPressure; MaxPressure.clear(); return Res; } - // Reset tracker to the point before the MI - // filling live regs upon this point using LIS. - // Returns false if block is empty except debug values. + /// Reset tracker to the point before \p MI, + /// filling \p LiveRegs at this point using LIS. + /// \returns false if the block is empty except for debug values. bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); - // Move to the state right before the next MI or after the end of MBB. - // Returns false if reached end of the block. - bool advanceBeforeNext(); - - // Move to the state at the MI, advanceBeforeNext has to be called first. - void advanceToNext(); - - // Move to the state at the next MI. Returns false if reached end of block. - bool advance(); - - // Advance instructions until before End. + /// Move to the state right before the next MI or after the end of MBB. + /// \returns false if the end of the block was reached. + /// If \p UseInternalIterator is true, then internal iterators are used and + /// set to process in program order. If \p UseInternalIterator is false, then + /// it is assumed that the tracker is using an externally managed iterator, + /// and advance* calls will not update the state of the iterator. In such + /// cases, the tracker will move to the state right before the provided \p MI + /// and use LIS for RP calculations. + bool advanceBeforeNext(MachineInstr *MI = nullptr, + bool UseInternalIterator = true); + + /// Move to the state at \p MI; advanceBeforeNext has to be called first. + /// If \p UseInternalIterator is true, then internal iterators are used and + /// set to process in program order. If \p UseInternalIterator is false, then + /// it is assumed that the tracker is using an externally managed iterator, + /// and advance* calls will not update the state of the iterator. In such + /// cases, the tracker will move to the state at the provided \p MI. + void advanceToNext(MachineInstr *MI = nullptr, + bool UseInternalIterator = true); + + /// Move to the state at the next MI. \returns false if the end of the + /// block was reached. If \p UseInternalIterator is true, then internal iterators are used + /// and set to process in program order. If \p UseInternalIterator is false, + /// then it is assumed that the tracker is using an externally managed + /// iterator, and advance* calls will not update the state of the iterator.
In + /// such cases, the tracker will move to the state right before the provided + /// \p MI and use LIS for RP calculations. + bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true); + + /// Advance instructions until before \p End. bool advance(MachineBasicBlock::const_iterator End); - // Reset to Begin and advance to End. + /// Reset to \p Begin and advance to \p End. bool advance(MachineBasicBlock::const_iterator Begin, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy = nullptr); + + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp + /// Calculate the impact \p MI will have on CurPressure and \return the + /// speculated pressure. In order to support RP Speculation, this does not + /// rely on the implicit program ordering in the LiveIntervals. + GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const; }; /// \returns the LaneMask of live lanes of \p Reg at position \p SI. Only the diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d6958d9055fade..11c95675aeeafa 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -58,11 +58,17 @@ static cl::opt "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false)); +static cl::opt<bool> GCNTrackers( + "amdgpu-use-amdgpu-trackers", cl::Hidden, + cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), + cl::init(false)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), - HasHighPressure(false) {} + DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { +} void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -148,17 +154,38 @@ static bool canUsePressureDiffs(const SUnit &SU) { return true; } -static void getRegisterPressures(bool AtTop, - const RegPressureTracker &RPTracker, SUnit *SU, - std::vector<unsigned> &Pressure, - std::vector<unsigned> &MaxPressure) { +static void getRegisterPressures( + bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, + std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, + GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, + ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { // getDownwardPressure() and getUpwardPressure() make temporary changes to // the tracker, so we need to pass those function a non-const copy. RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); - if (AtTop) - TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); - else - TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + if (!GCNTrackers) { + AtTop ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure) : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + + return; + } + + // GCNTrackers + Pressure.resize(4, 0); + MachineInstr *MI = SU->getInstr(); + GCNRegPressure NewPressure; + if (AtTop) { + GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); + NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); + } else { + GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); + TempUpwardTracker.recede(*MI); + NewPressure = TempUpwardTracker.getPressure(); + } + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + NewPressure.getArchVGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); }
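Editor's note, not part of the patch: the block above is the core of the change. When -amdgpu-use-amdgpu-trackers is enabled, candidate pressure is computed speculatively by copying the GCN tracker instead of querying the generic RegPressureTracker. A minimal sketch of that copy-and-bump pattern, built only from calls introduced or used by this patch; the two wrapper functions and their names are illustrative, not code from the tree:

// Sketch only: evaluate a candidate's pressure without mutating the shared
// tracker state owned by the scheduling strategy.
static GCNRegPressure speculateTopDown(const GCNDownwardRPTracker &Tracker,
                                       const MachineInstr *MI,
                                       const SIRegisterInfo *SRI) {
  // The copy keeps the real tracker untouched; bumpDownwardPressure() only
  // reports what CurPressure would become if MI were scheduled next.
  GCNDownwardRPTracker Tmp(Tracker);
  return Tmp.bumpDownwardPressure(MI, SRI);
}

static GCNRegPressure speculateBottomUp(const GCNUpwardRPTracker &Tracker,
                                        const MachineInstr &MI) {
  // For the bottom-up direction the copy actually recedes over MI and the
  // resulting pressure is read back.
  GCNUpwardRPTracker Tmp(Tracker);
  Tmp.recede(MI);
  return Tmp.getPressure();
}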
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -187,8 +214,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of // PressureDiffs. - if (AtTop || !canUsePressureDiffs(*SU)) { - getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure); + if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { + getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, + DownwardTracker, UpwardTracker, DAG, SRI); } else { // Reserve 4 slots. Pressure.resize(4, 0); @@ -206,7 +234,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector<unsigned> CheckPressure, CheckMaxPressure; - getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); + getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, + DownwardTracker, UpwardTracker, DAG, SRI); if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != @@ -294,8 +323,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; if (DAG->isTrackingPressure()) { - SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; - VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + if (!GCNTrackers) { + SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + } else { + GCNRPTracker *T = IsBottomUp + ? static_cast<GCNRPTracker *>(&UpwardTracker) + : static_cast<GCNRPTracker *>(&DownwardTracker); + SGPRPressure = T->getPressure().getSGPRNum(); + VGPRPressure = T->getPressure().getArchVGPRNum(); + } } ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { @@ -444,6 +481,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { return SU; } +void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (GCNTrackers) { + MachineInstr *MI = SU->getInstr(); + IsTopNode ? 
(void)DownwardTracker.advance(MI, false) + : UpwardTracker.recede(*MI); + } + + return GenericScheduler::schedNode(SU, IsTopNode); +} + GCNSchedStageID GCNSchedStrategy::getCurrentStage() { assert(CurrentStage && CurrentStage != SchedStages.end()); return *CurrentStage; } @@ -470,12 +517,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { } GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( - const MachineSchedContext *C) + const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); SchedStages.push_back(GCNSchedStageID::PreRARematerialize); + GCNTrackers = GCNTrackers & !IsLegacyScheduler; } GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) @@ -571,7 +619,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive( MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), MFI(*MF.getInfo<SIMachineFunctionInfo>()), - StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), + RegionLiveOuts(this, /*IsLiveOut=*/true) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); if (RelaxedOcc) { @@ -613,6 +662,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { return RPTracker.moveMaxPressure(); } +static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, + MachineBasicBlock::iterator RegionEnd) { + auto REnd = RegionEnd == RegionBegin->getParent()->end() ? std::prev(RegionEnd) : RegionEnd; + return &*skipDebugInstructionsBackward(REnd, RegionBegin); +} + void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB) { GCNDownwardRPTracker RPTracker(*LIS); @@ -687,20 +744,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, } DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> -GCNScheduleDAGMILive::getBBLiveInMap() const { +GCNScheduleDAGMILive::getRegionLiveInMap() const { assert(!Regions.empty()); - std::vector<MachineInstr *> BBStarters; - BBStarters.reserve(Regions.size()); + std::vector<MachineInstr *> RegionFirstMIs; + RegionFirstMIs.reserve(Regions.size()); auto I = Regions.rbegin(), E = Regions.rend(); auto *BB = I->first->getParent(); do { auto *MI = &*skipDebugInstructionsForward(I->first, I->second); - BBStarters.push_back(MI); + RegionFirstMIs.push_back(MI); do { ++I; } while (I != E && I->first->getParent() == BB); } while (I != E); - return getLiveRegMap(BBStarters, false /*After*/, *LIS); + return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); +} + +DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> +GCNScheduleDAGMILive::getRegionLiveOutMap() const { + assert(!Regions.empty()); + std::vector<MachineInstr *> RegionLastMIs; + RegionLastMIs.reserve(Regions.size()); + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); + + return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); +} + +void RegionPressureMap::buildLiveRegMap() { + IdxToInstruction.clear(); + + RegionLiveRegMap = + IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); + for (unsigned I = 0; I < DAG->Regions.size(); I++) { + MachineInstr *RegionKey = + IsLiveOut + ? 
getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) + : &*DAG->Regions[I].first; + IdxToInstruction[I] = RegionKey; + } } void GCNScheduleDAGMILive::finalizeSchedule() { @@ -726,8 +808,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() { void GCNScheduleDAGMILive::runSchedStages() { LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - if (!Regions.empty()) - BBLiveInMap = getBBLiveInMap(); + if (!Regions.empty()) { + BBLiveInMap = getRegionLiveInMap(); + if (GCNTrackers) + RegionLiveOuts.buildLiveRegMap(); + } GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); while (S.advanceStage()) { @@ -745,6 +830,19 @@ void GCNScheduleDAGMILive::runSchedStages() { continue; } + if (GCNTrackers) { + GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); + GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); + GCNRPTracker::LiveRegSet *RegionLiveIns = + &LiveIns[Stage->getRegionIdx()]; + + reinterpret_cast<GCNRPTracker *>(DownwardTracker) + ->reset(MRI, *RegionLiveIns); + reinterpret_cast<GCNRPTracker *>(UpwardTracker) + ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( + Stage->getRegionIdx())); + } + ScheduleDAGMILive::schedule(); Stage->finalizeGCNRegion(); } @@ -1015,6 +1113,7 @@ void GCNSchedStage::finalizeGCNRegion() { void GCNSchedStage::checkScheduling() { // Check the results of scheduling. PressureAfter = DAG.getRealRegPressure(RegionIdx); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); @@ -1586,6 +1685,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, DAG.Regions = NewRegions; DAG.RescheduleRegions = NewRescheduleRegions; + if (GCNTrackers) + DAG.RegionLiveOuts.buildLiveRegMap(); + SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index f0aea2bc4ab865..64d517038f90e0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler { // Pointer to the current SchedStageID. SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr; + // GCN RP Tracker for top-down scheduling + mutable GCNDownwardRPTracker DownwardTracker; + + // GCN RP Tracker for bottom-up scheduling + mutable GCNUpwardRPTracker UpwardTracker; + public: // schedule() have seen register pressure over the critical limits and had to // track register pressure for actual scheduling heuristics. @@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler { SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void initialize(ScheduleDAGMI *DAG) override; unsigned getTargetOccupancy() { return TargetOccupancy; } @@ -116,13 +124,18 @@ class GCNSchedStrategy : public GenericScheduler { bool hasNextStage() const; GCNSchedStageID getNextStage() const; + + GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; } + + GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; } };
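Editor's note, not part of the patch: pieced together from the GCNSchedStrategy.cpp changes above, the intended driving pattern for the two trackers is to seed them at each region boundary and then keep them in sync per scheduled node. A minimal sketch under that assumption; the accessor, reset, advance and recede calls come from the patch, while the two helper functions themselves are hypothetical:

// Sketch: seed the trackers for a region, then update them as nodes are
// picked. The scheduler owns the iteration order (UseInternalIterator=false).
static void seedTrackersForRegion(GCNSchedStrategy &S,
                                  const MachineRegisterInfo &MRI,
                                  const GCNRPTracker::LiveRegSet &LiveIns,
                                  const GCNRPTracker::LiveRegSet &LiveOuts) {
  // Top-down tracking starts from the region's live-ins, bottom-up tracking
  // from its live-outs (see GCNScheduleDAGMILive::runSchedStages).
  GCNRPTracker *Down = S.getDownwardTracker();
  GCNRPTracker *Up = S.getUpwardTracker();
  Down->reset(MRI, LiveIns);
  Up->reset(MRI, LiveOuts);
}

static void noteScheduledInstr(GCNSchedStrategy &S, MachineInstr *MI,
                               bool IsTopNode) {
  // Mirrors GCNSchedStrategy::schedNode: pass the instruction explicitly
  // instead of relying on the tracker's internal iterator.
  if (IsTopNode)
    S.getDownwardTracker()->advance(MI, /*UseInternalIterator=*/false);
  else
    S.getUpwardTracker()->recede(*MI);
}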
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. /// maximum number of waves per simd). class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy { public: - GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); + GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, + bool IsLegacyScheduler = false); }; /// The goal of this scheduling strategy is to maximize ILP for a single wave @@ -163,6 +176,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { return OS; } +class GCNScheduleDAGMILive; +class RegionPressureMap { + GCNScheduleDAGMILive *DAG; + // The live in/out pressure as indexed by the first or last MI in the region + // before scheduling. + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> RegionLiveRegMap; + // The mapping of RegionIdx to key instruction + DenseMap<unsigned, MachineInstr *> IdxToInstruction; + // Whether we are calculating LiveOuts or LiveIns + bool IsLiveOut; + +public: + RegionPressureMap() {} + RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) + : DAG(GCNDAG), IsLiveOut(LiveOut) {} + // Build the Instr->LiveReg and RegionIdx->Instr maps + void buildLiveRegMap(); + + // Retrieve the LiveReg for a given RegionIdx + GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) { + assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end()); + MachineInstr *Key = IdxToInstruction[RegionIdx]; + return RegionLiveRegMap[Key]; + } +}; +
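Editor's note, not part of the patch: RegionPressureMap is keyed internally by each region's boundary instruction, while callers only deal in region indices. A small usage sketch under that reading; the types and member calls are from the patch, the helper function itself is hypothetical:

// Sketch: build the live-out map once regions are recorded, then seed a
// bottom-up tracker for one region by index.
static void seedUpwardTrackerFromLiveOuts(GCNScheduleDAGMILive *DAG,
                                          GCNUpwardRPTracker &UpwardTracker,
                                          const MachineRegisterInfo &MRI,
                                          unsigned RegionIdx) {
  // LiveOut=true keys the map by each region's last MI. buildLiveRegMap()
  // has to be re-run if rematerialization later rewrites region boundaries.
  RegionPressureMap LiveOuts(DAG, /*LiveOut=*/true);
  LiveOuts.buildLiveRegMap();

  // Region indices remain the stable handle even though the underlying keys
  // are instructions.
  UpwardTracker.reset(MRI, LiveOuts.getLiveRegsForRegionIdx(RegionIdx));
}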
class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class OccInitialScheduleStage; @@ -170,6 +209,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class ClusteredLowOccStage; friend class PreRARematStage; friend class ILPInitialScheduleStage; + friend class RegionPressureMap; const GCNSubtarget &ST; @@ -211,9 +251,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Temporary basic block live-in cache. DenseMap<MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns; + // The map of the initial first region instruction to region live in registers DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap; - DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const; + // Calculate the map of the initial first region instruction to region live in + // registers + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const; + + // Calculate the map of the initial last region instruction to region live out + // registers + DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> + getRegionLiveOutMap() const; + + // The live out registers per region. These are internally stored as a map of + // the initial last region instruction to region live out registers, but can + // be retrieved with the regionIdx by calls to getLiveRegsForRegionIdx. + RegionPressureMap RegionLiveOuts; // Return current region pressure. GCNRegPressure getRealRegPressure(unsigned RegionIdx) const; @@ -311,6 +364,9 @@ class GCNSchedStage { return DAG.RegionsWithExcessRP[RegionIdx]; } + // The region number this stage is currently working on + unsigned getRegionIdx() { return RegionIdx; } + // Returns true if the new schedule may result in more spilling. bool mayCauseSpilling(unsigned WavesAfter); diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir index e9005e94ce5db7..d57450baea911a 100644 --- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -1,11 +1,17 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @high-RP-reschedule() { ret void } ... -# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 +# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 + +# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4. +# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule + +# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. --- name: high-RP-reschedule diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index 4be102f7860eab..f496a4b06bb237 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,4 +1,5 @@ -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s # Check that %3 was not rematerialized before the last store since its operand %1 # is killed by that store. 
@@ -7,6 +8,9 @@ # GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) # GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 +# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64 +# GCN-GCNTRACKER-NOT: SI_SPILL + --- name: global_sextload_v32i32_to_v32i64 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll new file mode 100644 index 00000000000000..79187f51af0d2b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll @@ -0,0 +1,65 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s + +%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs + <16 x i32>, <7 x i32>, ; vgprs + i64 ; vcc + } + +%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs + <16 x i32>, <5 x i32>, ; vgprs + i64 ; vcc + } + +%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs + <16 x i32>, <6 x i32>, ; vgprs + i64 ; vcc + } + +; ERR-GCNTRACKERS: ran out of registers during register allocation +; GCN-NOT: ran out of registers during register allocation + +; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse + +define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 { + %alloca0 = alloca [4096 x i32], align 64, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + call void asm sideeffect "; use alloca0 $0", "v"(ptr addrspace(5) %alloca0) + + %asm = call %asm.output asm sideeffect + "; def $0, $1, $2, $3, $4, $5, $6, $7, $8", + "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"() + + %s0 = extractvalue %asm.output %asm, 0 + %s1 = extractvalue %asm.output %asm, 1 + %s2 = extractvalue %asm.output %asm, 2 + %s3 = extractvalue %asm.output %asm, 3 + %s4 = extractvalue %asm.output %asm, 4 + %s5 = extractvalue %asm.output %asm, 5 + + %v0 = extractvalue %asm.output %asm, 6 + %v1 = extractvalue %asm.output %asm, 7 + + %vcc = extractvalue %asm.output %asm, 8 + + ; scc is unavailable since it is live in + call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10", + "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"( + <16 x i32> %s0, + <16 x i32> %s1, + <16 x i32> %s2, + <8 x i32> %s3, + <2 x i32> %s4, + i32 %s5, + <16 x i32> %v0, + <7 x i32> %v1, + i64 %vcc, + ptr addrspace(5) %alloca1, + i32 0) ; use of scc + + ret void +} + +attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } +attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } + diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll new file mode 100644 index 
00000000000000..c490c76f4531de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -0,0 +1,491 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck --check-prefix=GCN-GCNTRACKERS %s + +; CHECK-LABEL: {{^}}spill: +; GCN: codeLenInByte = 1000 +; GCN-GCNTRACKERS: codeLenInByte = 1016 +; GCN: NumSgprs: 104 +; GCN-GCNTRACKERS: NumSgprs: 104 +; GCN: NumVgprs: 1 +; GCN-GCNTRACKERS: NumVgprs: 2 +; GCN: ScratchSize: 0 +; GCN-GCNTRACKERS: ScratchSize: 0 +; GCN: Occupancy: 5 +; GCN-GCNTRACKERS: Occupancy: 5 + +; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse + +define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { +entry: + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + 
%sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm 
sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 68 bytes + ; 64 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call 
void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg 
use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 + ret void +} + +; CHECK-LABEL: {{^}}spill_func: +; GCN: codeLenInByte = 1612 +; GCN-GCNTRACKERS: codeLenInByte = 1660 +; GCN: NumSgprs: 104 +; GCN-GCNTRACKERS: NumSgprs: 104 +; GCN: NumVgprs: 3 +; GCN-GCNTRACKERS: NumVgprs: 4 +; GCN: ScratchSize: 12 +; GCN-GCNTRACKERS: ScratchSize: 16 + +define void @spill_func(ptr addrspace(1) %arg) #0 { +entry: + %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0 + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + 
%sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 + %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm 
sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 68 bytes + ; 64 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + 
v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg 
use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 
%sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll new file mode 100644 index 00000000000000..53f533ebb28427 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -0,0 +1,647 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s +; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s + +; GCN Trackers are sensitive to minor changes in RP, and will avoid scheduling certain instructions, which, if scheduled, +; allow scheduling of other instructions which reduce RP + +; CHECK-LABEL: {{^}}return_72xi32: +; GFX11-PAL: codeLenInByte = 768 +; GFX11-PAL-GCNTRACKERS: codeLenInByte = 888 +; GFX11-PAL: NumSgprs: 33 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 33 +; GFX11-PAL: NumVgprs: 64 +; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 +; GFX11-PAL: ScratchSize: 220 +; GFX11-PAL-GCNTRACKERS: ScratchSize: 248 + + +; CHECK-LABEL: {{^}}call_72xi32: +; GFX11-PAL: codeLenInByte = 1300 +; GFX11-PAL-GCNTRACKERS: codeLenInByte = 1372 +; GFX11-PAL: NumSgprs: 35 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 35 +; GFX11-PAL: NumVgprs: 64 +; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 +; GFX11-PAL: ScratchSize: 2780 +; GFX11-PAL-GCNTRACKERS: ScratchSize: 2808 + + +define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { + ret <72 x i32> %val +} + +define amdgpu_gfx void @call_72xi32() #1 { +entry: + %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer) + %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0 + %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58 + %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1) + ret void +} + +; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64: +; TONGA: codeLenInByte = 420 +; TONGA-GCNTRACKERS: codeLenInByte = 436 +; TONGA: NumSgprs: 96 +; TONGA-GCNTRACKERS: NumSgprs: 96 +; TONGA: NumVgprs: 33 +; TONGA-GCNTRACKERS: NumVgprs: 25 +; TONGA: Occupancy: 7 +; TONGA-GCNTRACKERS: Occupancy: 8 + + +define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { + %val = 
load <16 x half>, ptr addrspace(1) %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, ptr addrspace(1) %out + ret void +} + +; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: +; GENERIC: codeLenInByte = 860 +; GENERIC-GCNTRACKERS: codeLenInByte = 860 +; GENERIC: NumSgprs: 71 +; GENERIC-GCNTRACKERS: NumSgprs: 54 +; GENERIC: NumVgprs: 16 +; GENERIC-GCNTRACKERS: NumVgprs: 16 +; GENERIC: Occupancy: 7 +; GENERIC-GCNTRACKERS: Occupancy: 8 + +define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { + %load = load <64 x i16>, ptr addrspace(4) %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, ptr addrspace(1) %out + ret void +} + +; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: +; GFX908: codeLenInByte = 1436 +; GFX908-GCNTRACKERS: codeLenInByte = 1436 +; GFX908: NumSgprs: 56 +; GFX908-GCNTRACKERS: NumSgprs: 56 +; GFX908: NumVgprs: 43 +; GFX908-GCNTRACKERS: NumVgprs: 39 +; GFX908: Occupancy: 5 +; GFX908-GCNTRACKERS: Occupancy: 6 + + +define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { +entry: + %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %i2 = load i64, ptr addrspace(4) %i, align 8 + %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i4 = shl i32 %i3, 8 + %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5 + %i6 = add i32 %i4, %i5 + %i7 = trunc i64 %i2 to i32 + %conv = add i32 %i6, %i7 + %conv.frozen = freeze i32 %conv + %div = udiv i32 %conv.frozen, 49 + %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef + %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5 + br label %for.cond28.preheader + +for.cond28.preheader: ; preds = %for.cond28.preheader, %entry + %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ] + %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ] + %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ] + %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ] + %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ] + %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ] + %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ] + %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ] + %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ] + %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ] + %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ %i211, %for.cond28.preheader ] + %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ] + %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ] + %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ] + %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ] + %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ] + %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ] + %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ] + %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, 
%for.cond28.preheader ] + %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ] + %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ] + %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ] + %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ] + %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ] + %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ] + %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ] + %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ] + %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ] + %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ] + %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ] + %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ] + %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ] + %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ] + %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ] + %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ] + %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4 + %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49 + %i9 = load float, ptr addrspace(1) %add.ptr47, align 4 + %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98 + %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4 + %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147 + %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4 + %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4 + %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024 + %i13 = load float, ptr addrspace(4) %add.ptr66, align 4 + %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048 + %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4 + %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072 + %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4 + %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1 + %i16 = load float, ptr addrspace(4) %add.ptr70, align 4 + %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025 + %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4 + %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049 + %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4 + %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073 + %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4 + %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2 + %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4 + %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026 + %i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4 + %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050 + %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4 + %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) 
%w_ptr.0287, i64 3074 + %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4 + %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3 + %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4 + %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027 + %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4 + %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051 + %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4 + %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075 + %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4 + %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4 + %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4 + %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028 + %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4 + %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052 + %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4 + %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076 + %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4 + %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5 + %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4 + %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029 + %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4 + %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053 + %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4 + %add.ptr66.2.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077 + %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4 + %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6 + %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4 + %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030 + %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4 + %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054 + %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4 + %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078 + %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4 + %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7 + %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4 + %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031 + %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4 + %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055 + %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4 + %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079 + %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4 + %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8 + %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4 + %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032 + %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4 + %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056 + %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4 + %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 
3080 + %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4 + %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9 + %i48 = load float, ptr addrspace(4) %add.ptr70.8, align 4 + %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033 + %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4 + %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057 + %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4 + %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081 + %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4 + %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10 + %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4 + %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034 + %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4 + %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058 + %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4 + %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082 + %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4 + %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11 + %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4 + %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035 + %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4 + %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059 + %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4 + %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083 + %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4 + %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12 + %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4 + %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036 + %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4 + %add.ptr66.1.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060 + %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4 + %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084 + %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4 + %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13 + %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4 + %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037 + %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4 + %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061 + %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4 + %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085 + %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4 + %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14 + %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4 + %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038 + %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4 + %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062 + %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4 + %add.ptr66.2.14 = getelementptr inbounds float, ptr 
addrspace(4) %w_ptr.0287, i64 3086 + %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4 + %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15 + %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4 + %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039 + %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4 + %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063 + %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4 + %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3087 + %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4 + %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16 + %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4 + %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040 + %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4 + %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064 + %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4 + %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088 + %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4 + %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17 + %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4 + %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041 + %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4 + %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065 + %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4 + %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089 + %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4 + %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18 + %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4 + %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042 + %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4 + %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066 + %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4 + %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090 + %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4 + %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19 + %i88 = load float, ptr addrspace(4) %add.ptr70.18, align 4 + %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043 + %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4 + %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067 + %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4 + %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091 + %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4 + %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20 + %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4 + %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044 + %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4 + %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068 + %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4 + %add.ptr66.2.20 = 
getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092 + %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4 + %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21 + %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4 + %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045 + %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4 + %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069 + %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4 + %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093 + %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4 + %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22 + %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4 + %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046 + %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4 + %add.ptr66.1.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070 + %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4 + %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094 + %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4 + %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23 + %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4 + %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047 + %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4 + %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071 + %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4 + %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095 + %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4 + %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24 + %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4 + %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048 + %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4 + %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072 + %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4 + %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096 + %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4 + %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25 + %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4 + %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049 + %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4 + %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073 + %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4 + %add.ptr66.2.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3097 + %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4 + %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26 + %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4 + %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050 + %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4 + %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074 + %i118 = load float, ptr 
addrspace(4) %add.ptr66.1.26, align 4 + %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098 + %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4 + %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27 + %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4 + %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051 + %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4 + %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075 + %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4 + %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099 + %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4 + %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28 + %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4 + %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052 + %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4 + %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076 + %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4 + %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100 + %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4 + %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29 + %i128 = load float, ptr addrspace(4) %add.ptr70.28, align 4 + %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053 + %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4 + %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077 + %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4 + %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101 + %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4 + %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30 + %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4 + %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054 + %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4 + %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078 + %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4 + %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102 + %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4 + %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31 + %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4 + %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055 + %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4 + %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079 + %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4 + %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103 + %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4 + %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196 + %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0) + %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140) + %i142 = tail call float @llvm.fmuladd.f32(float %i10, float %i14, float %i141) + %i143 = tail 
call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142) + %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0) + %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144) + %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145) + %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146) + %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0) + %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148) + %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149) + %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150) + %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0) + %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152) + %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153) + %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154) + %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0) + %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156) + %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157) + %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158) + %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0) + %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160) + %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161) + %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162) + %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0) + %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164) + %i166 = tail call float @llvm.fmuladd.f32(float %i10, float %i38, float %i165) + %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166) + %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0) + %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168) + %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169) + %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170) + %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0) + %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172) + %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173) + %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174) + %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0) + %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176) + %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177) + %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178) + %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0) + %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180) + %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181) + %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182) + %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0) + %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, 
float %i184) + %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185) + %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186) + %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0) + %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188) + %i190 = tail call float @llvm.fmuladd.f32(float %i10, float %i62, float %i189) + %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190) + %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0) + %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192) + %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193) + %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194) + %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0) + %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196) + %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197) + %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198) + %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0) + %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200) + %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201) + %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202) + %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0) + %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204) + %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205) + %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206) + %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0) + %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208) + %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209) + %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210) + %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0) + %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212) + %i214 = tail call float @llvm.fmuladd.f32(float %i10, float %i86, float %i213) + %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214) + %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0) + %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216) + %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217) + %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218) + %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0) + %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220) + %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221) + %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222) + %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0) + %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224) + %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225) + %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226) + %i228 = tail call float @llvm.fmuladd.f32(float 
%i8, float %i100, float %accum.sroa.90.0) + %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228) + %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229) + %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230) + %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0) + %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232) + %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233) + %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234) + %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0) + %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236) + %i238 = tail call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237) + %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238) + %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0) + %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240) + %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241) + %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242) + %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0) + %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244) + %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245) + %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246) + %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0) + %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248) + %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249) + %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250) + %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0) + %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252) + %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253) + %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254) + %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0) + %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256) + %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257) + %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258) + %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0) + %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, float %i260) + %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261) + %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262) + %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0) + %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264) + %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265) + %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266) + %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096 + %inc116 = add nuw nsw i32 %ci.0286, 1 + %exitcond.not = icmp eq i32 %inc116, 512 + br i1 %exitcond.not, label %for.cond.cleanup26, label 
%for.cond28.preheader + +for.cond.cleanup26: ; preds = %for.cond28.preheader + %mul119 = shl nuw nsw i32 undef, 1 + %mul120 = mul i32 %div, 200704 + %mul121 = mul i32 undef, 6272 + %add122 = add i32 %mul120, %mul121 + %mul123 = mul nuw nsw i32 undef, 28 + %add124 = add i32 %add122, %mul123 + %add126 = add i32 %add124, %mul119 + %idx.ext127 = zext i32 %add126 to i64 + %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127 + store float %i143, ptr addrspace(1) %add.ptr128, align 4 + %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196 + store float %i147, ptr addrspace(1) %add.ptr184, align 4 + %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14 + store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4 + %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4 + %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196 + store float %i151, ptr addrspace(1) %add.ptr184.1, align 4 + %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196 + store float %i155, ptr addrspace(1) %add.ptr184.2, align 4 + %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196 + store float %i159, ptr addrspace(1) %add.ptr184.3, align 4 + %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196 + store float %i163, ptr addrspace(1) %add.ptr184.4, align 4 + %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4 + %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196 + store float %i167, ptr addrspace(1) %add.ptr184.5, align 4 + %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4 + %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196 + store float %i171, ptr addrspace(1) %add.ptr184.6, align 4 + %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196 + store float %i175, ptr addrspace(1) %add.ptr184.7, align 4 + %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14 + store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4 + %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4 + %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196 + store float %i179, ptr addrspace(1) %add.ptr184.8, align 4 + %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196 + store float %i183, ptr addrspace(1) %add.ptr184.9, align 4 + %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196 + store float %i187, ptr addrspace(1) %add.ptr184.10, align 4 + %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196 + store float %i191, ptr addrspace(1) %add.ptr184.11, align 4 + %add.ptr184.12 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196 + store float %i195, ptr addrspace(1) %add.ptr184.12, align 4 + %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196 + store float %i199, ptr addrspace(1) %add.ptr184.13, align 4 + %add.ptr184.14 
= getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196 + store float %i203, ptr addrspace(1) %add.ptr184.14, align 4 + %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196 + store float %i207, ptr addrspace(1) %add.ptr184.15, align 4 + %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196 + store float %i211, ptr addrspace(1) %add.ptr184.16, align 4 + %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196 + store float %i215, ptr addrspace(1) %add.ptr184.17, align 4 + %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196 + store float %i219, ptr addrspace(1) %add.ptr184.18, align 4 + %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196 + store float %i223, ptr addrspace(1) %add.ptr184.19, align 4 + %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196 + store float %i227, ptr addrspace(1) %add.ptr184.20, align 4 + %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196 + store float %i231, ptr addrspace(1) %add.ptr184.21, align 4 + %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196 + store float %i235, ptr addrspace(1) %add.ptr184.22, align 4 + %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196 + store float %i239, ptr addrspace(1) %add.ptr184.23, align 4 + %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196 + store float %i243, ptr addrspace(1) %add.ptr184.24, align 4 + %add.ptr184.25 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.24, i64 196 + store float %i247, ptr addrspace(1) %add.ptr184.25, align 4 + %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196 + store float %i251, ptr addrspace(1) %add.ptr184.26, align 4 + %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196 + store float %i255, ptr addrspace(1) %add.ptr184.27, align 4 + %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196 + store float %i259, ptr addrspace(1) %add.ptr184.28, align 4 + %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196 + store float %i263, ptr addrspace(1) %add.ptr184.29, align 4 + %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196 + store float %i267, ptr addrspace(1) %add.ptr184.30, align 4 + ret void +} + + + +declare float @llvm.fmuladd.f32(float, float, float) #2 +declare i32 @llvm.amdgcn.workitem.id.x() #3 +declare i32 @llvm.amdgcn.workgroup.id.x() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 + +!0 = !{i32 1, i32 2, i32 1, i32 0} +!1 = !{!"none", !"none", !"none", !"none"} +!2 = !{!"ptr", !"ptr", !"ptr", !"float"} +!3 = !{!"restrict const", !"restrict const", !"restrict", !""} +!4 = !{i32 256, i32 1, i32 1} +!5 = !{i32 0, i32 1024} + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="64" } +attributes #2 = { nofree nosync nounwind readnone speculatable willreturn } +attributes #3 = { nounwind readnone speculatable willreturn } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir index 14bb4310c619ea..34d203e0de2ffa 
100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir @@ -1,4 +1,5 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 { @@ -11,6 +12,20 @@ # GCN-LABEL: name: no_sched_metric_due_to_spills # GCN-NOT: SI_SPILL_ # GCN: S_ENDPGM + +# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: S_ENDPGM + +# When using the GCN Trackers, the scheduler is able to achieve the desired occupancy without running the high-RP-reschedule stage. However, the RP is still high, +# and RA is unable to allocate without spills. Running the high-RP-reschedule stage would have further decreased RP, which provides increased +# flexibility for RA. + --- name: no_sched_metric_due_to_spills tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 94815558bf3d6d..71f8d91874f04f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -1,16 +1,24 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target -; GCN-LABEL: {{^}}load_fma_store: +; CHECK-LABEL: {{^}}load_fma_store: ; OCC: NumVgprs: 32 +; OCC-GCNTRACKER: NumVgprs: 24 ; RELAX: NumVgprs: 64 +; RELAX-GCNTRACKER: NumVgprs: 60 ; OCC: NumVGPRsForWavesPerEU: 32 +; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24 ; RELAX: NumVGPRsForWavesPerEU: 64 +; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60 ; OCC: Occupancy: 8 +; OCC-GCNTRACKER: Occupancy: 8 ; RELAX: Occupancy: 4 +; RELAX-GCNTRACKER: Occupancy: 4 define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 { bb: