From 774099135e529dbd3d5bf9fd2da78b01ff54fa12 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 21 May 2024 12:55:07 -0700 Subject: [PATCH 01/27] [AMDGPU] NFC: Add BBLiveOutMap & LiveOut Cache Change-Id: I63cfd44e635cc4bee0e6780ca43b692c46e940b7 --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 58 ++++++++++++++++++--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 42 ++++++++++++++- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d6958d9055fade..0a1a72c230db85 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -58,6 +58,11 @@ static cl::opt "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false)); +static cl::opt GCNTrackers( + "amdgpu-use-amdgpu-trackers", cl::Hidden, + cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), + cl::init(false)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -571,7 +576,8 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive( MachineSchedContext *C, std::unique_ptr S) : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget()), MFI(*MF.getInfo()), - StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), + RegionLiveOuts(this, /*IsLiveOut=*/true) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); if (RelaxedOcc) { @@ -613,6 +619,14 @@ GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { return RPTracker.moveMaxPressure(); } +static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, + MachineBasicBlock::iterator RegionEnd) { + auto REnd = RegionEnd == RegionBegin->getParent()->end() + ? 
std::prev(RegionEnd) + : RegionEnd; + return &*skipDebugInstructionsBackward(REnd, RegionBegin); +} + void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB) { GCNDownwardRPTracker RPTracker(*LIS); @@ -687,20 +701,45 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, } DenseMap -GCNScheduleDAGMILive::getBBLiveInMap() const { +GCNScheduleDAGMILive::getRegionLiveInMap() const { assert(!Regions.empty()); - std::vector BBStarters; - BBStarters.reserve(Regions.size()); + std::vector RegionFirstMIs; + RegionFirstMIs.reserve(Regions.size()); auto I = Regions.rbegin(), E = Regions.rend(); auto *BB = I->first->getParent(); do { auto *MI = &*skipDebugInstructionsForward(I->first, I->second); - BBStarters.push_back(MI); + RegionFirstMIs.push_back(MI); do { ++I; } while (I != E && I->first->getParent() == BB); } while (I != E); - return getLiveRegMap(BBStarters, false /*After*/, *LIS); + return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); +} + +DenseMap +GCNScheduleDAGMILive::getRegionLiveOutMap() const { + assert(!Regions.empty()); + std::vector RegionLastMIs; + RegionLastMIs.reserve(Regions.size()); + for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) + RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); + + return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); +} + +void RegionPressureMap::buildLiveRegMap() { + IdxToInstruction.clear(); + + BBLiveRegMap = + IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); + for (unsigned I = 0; I < DAG->Regions.size(); I++) { + MachineInstr *RegionKey = + IsLiveOut + ? 
getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second) + : &*DAG->Regions[I].first; + IdxToInstruction[I] = RegionKey; + } } void GCNScheduleDAGMILive::finalizeSchedule() { @@ -726,8 +765,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() { void GCNScheduleDAGMILive::runSchedStages() { LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - if (!Regions.empty()) - BBLiveInMap = getBBLiveInMap(); + if (!Regions.empty()) { + BBLiveInMap = getRegionLiveInMap(); + if (GCNTrackers) + RegionLiveOuts.buildLiveRegMap(); + } GCNSchedStrategy &S = static_cast(*SchedImpl); while (S.advanceStage()) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index f0aea2bc4ab865..c402fb1ef373c9 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -163,6 +163,32 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) { return OS; } +class GCNScheduleDAGMILive; +class RegionPressureMap { + GCNScheduleDAGMILive *DAG; + // The live in/out pressure as indexed by the first or last MI in the region + // before scheduling. 
+ DenseMap BBLiveRegMap; + // The mapping of RegionIDx to key instruction + DenseMap IdxToInstruction; + // Whether we are calculating LiveOuts or LiveIns + bool IsLiveOut; + +public: + RegionPressureMap() {} + RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut) + : DAG(GCNDAG), IsLiveOut(LiveOut) {} + // Build the Instr->LiveReg and RegionIdx->Instr maps + void buildLiveRegMap(); + + // Retrieve the LiveReg for a given RegionIdx + GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) { + assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end()); + MachineInstr *Key = IdxToInstruction[RegionIdx]; + return BBLiveRegMap[Key]; + } +}; + class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class GCNSchedStage; friend class OccInitialScheduleStage; @@ -170,6 +196,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { friend class ClusteredLowOccStage; friend class PreRARematStage; friend class ILPInitialScheduleStage; + friend class RegionPressureMap; const GCNSubtarget &ST; @@ -211,9 +238,22 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive { // Temporary basic block live-in cache. DenseMap MBBLiveIns; + // The map of the initial first region instruction to region live in registers DenseMap BBLiveInMap; - DenseMap getBBLiveInMap() const; + // Calculate the map of the initial first region instruction to region live in + // registers + DenseMap getRegionLiveInMap() const; + + // Calculate the map of the initial last region instruction to region live out + // registers + DenseMap + getRegionLiveOutMap() const; + + // The live out registers per region. These are internally stored as a map of + // the initial last region instruction to region live out registers, but can + // be retreived with the regionIdx by calls to getLiveRegsForRegionIdx. + RegionPressureMap RegionLiveOuts; // Return current region pressure. 
GCNRegPressure getRealRegPressure(unsigned RegionIdx) const; From d2f80207760104b39a543f0822efbf9b7a2208ab Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 21 May 2024 13:34:59 -0700 Subject: [PATCH 02/27] [AMDGPU] NFC: Provide RPTracker interface for external iterators Change-Id: I79b54722e6e858961486248d94766c3f3c161160 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 284 ++++++++++++++++++++-- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 95 ++++++-- 2 files changed, 330 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7c633b2bce7bc2..9bf1d6260359fc 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -296,6 +296,72 @@ collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, } } +static LaneBitmask getRegLanes(ArrayRef RegUnits, + Register RegUnit) { + auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { + return Other.RegUnit == RegUnit; + }); + if (I == RegUnits.end()) + return LaneBitmask::getNone(); + return I->LaneMask; +} + +static LaneBitmask +getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, + bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, + LaneBitmask SafeDefault, + bool (*Property)(const LiveRange &LR, SlotIndex Pos)) { + if (RegUnit.isVirtual()) { + const LiveInterval &LI = LIS.getInterval(RegUnit); + LaneBitmask Result; + if (TrackLaneMasks && LI.hasSubRanges()) { + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (Property(SR, Pos)) + Result |= SR.LaneMask; + } + } else if (Property(LI, Pos)) { + Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) + : LaneBitmask::getAll(); + } + + return Result; + } else { + const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); + // Be prepared for missing liveranges: We usually do not compute liveranges + // for physical registers on targets with many registers (GPUs). 
+ if (LR == nullptr) + return SafeDefault; + return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone(); + } +} + +/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx). +/// The query starts with a lane bitmask which gets lanes/bits removed for every +/// use we find. +static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, + SlotIndex PriorUseIdx, SlotIndex NextUseIdx, + const MachineRegisterInfo &MRI, + const LiveIntervals *LIS, + bool Upward = false) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + if (MO.isUndef()) + continue; + const MachineInstr *MI = MO.getParent(); + SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); + bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx) + : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx); + if (InRange) { + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx); + LastUseMask &= ~UseMask; + if (LastUseMask.none()) + return LaneBitmask::getNone(); + } + } + return LastUseMask; +} + /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -354,17 +420,47 @@ void GCNRPTracker::reset(const MachineInstr &MI, MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); } -//////////////////////////////////////////////////////////////////////////////// -// GCNUpwardRPTracker - -void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_, - const LiveRegSet &LiveRegs_) { +void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, + const LiveRegSet &LiveRegs_) { MRI = &MRI_; LiveRegs = LiveRegs_; LastTrackedMI = nullptr; MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_); } +void GCNRPTracker::bumpDeadDefs(ArrayRef DeadDefs) { + for (const RegisterMaskPair &P : DeadDefs) { + Register Reg = P.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LiveMask = 
LiveRegs[Reg]; + LaneBitmask BumpedMask = LiveMask | P.LaneMask; + CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI); + } + MaxPressure = max(MaxPressure, CurPressure); + for (const RegisterMaskPair &P : DeadDefs) { + Register Reg = P.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LiveMask = LiveRegs[Reg]; + LaneBitmask BumpedMask = LiveMask | P.LaneMask; + CurPressure.inc(Reg, BumpedMask, LiveMask, *MRI); + } +} + +LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, + SlotIndex Pos) const { + return getLanesWithProperty( + LIS, *MRI, true, RegUnit, Pos.getBaseIndex(), LaneBitmask::getNone(), + [](const LiveRange &LR, SlotIndex Pos) { + const LiveRange::Segment *S = LR.getSegmentContaining(Pos); + return S != nullptr && S->end == Pos.getRegSlot(); + }); +} + +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); @@ -425,6 +521,63 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(CurPressure == getRegPressure(*MRI, LiveRegs)); } +void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) { + assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); + + SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); + + // Account for register pressure similar to RegPressureTracker::recede(). + RegisterOperands RegOpers; + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true); + assert(RegOpers.DeadDefs.empty()); + RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); + RegOpers.detectDeadDefs(*MI, LIS); + + // Boost max pressure for all dead defs together. + // Since CurrSetPressure and MaxSetPressure + bumpDeadDefs(RegOpers.DeadDefs); + + // Kill liveness at live defs. 
+ for (const RegisterMaskPair &P : RegOpers.Defs) { + Register Reg = P.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LiveAfter = LiveRegs[Reg]; + LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg); + LaneBitmask DefLanes = P.LaneMask; + LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes; + + // There may be parts of the register that were dead before the + // instruction, but became live afterwards. Similarly, some parts + // may have been killed in this instruction. + CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI); + CurPressure.inc(Reg, LiveAfter, ~LiveAfter & LiveBefore, *MRI); + MaxPressure = max(MaxPressure, CurPressure); + } + // Generate liveness for uses. + for (const RegisterMaskPair &P : RegOpers.Uses) { + Register Reg = P.RegUnit; + if (!Reg.isVirtual()) + continue; + // If this register was also in a def operand, we've handled it + // with defs. + if (getRegLanes(RegOpers.Defs, Reg).any()) + continue; + LaneBitmask LiveAfter = LiveRegs[Reg]; + SlotIndex CurrIdx = + LastTrackedMI ? 
LIS.getInstructionIndex(*LastTrackedMI).getRegSlot() + : LIS.getMBBEndIdx(MI->getParent()); + ; + LaneBitmask LastUseMask = + findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, *MRI, &LIS, true); + LastUseMask &= ~LiveAfter; + LaneBitmask LiveBefore = (LiveAfter | LastUseMask); + CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI); + } + MaxPressure = max(MaxPressure, CurPressure); +} + //////////////////////////////////////////////////////////////////////////////// // GCNDownwardRPTracker @@ -441,28 +594,44 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI, return true; } -bool GCNDownwardRPTracker::advanceBeforeNext() { +bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, + bool UseInternalIterator, + LiveIntervals *TheLIS) { assert(MRI && "call reset first"); - if (!LastTrackedMI) - return NextMI == MBBEnd; - - assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); + SlotIndex SI; + LiveIntervals *CurrLIS; + MachineInstr *CurrMI; + if (UseInternalIterator) { + if (!LastTrackedMI) + return NextMI == MBBEnd; + + assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); + CurrLIS = const_cast(&LIS); + CurrMI = const_cast(LastTrackedMI); + + SI = NextMI == MBBEnd + ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot() + : CurrLIS->getInstructionIndex(*NextMI).getBaseIndex(); + } else { //! UseInternalIterator + CurrLIS = TheLIS; + SI = CurrLIS->getInstructionIndex(*MI).getBaseIndex(); + CurrMI = MI; + } - SlotIndex SI = NextMI == MBBEnd - ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot() - : LIS.getInstructionIndex(*NextMI).getBaseIndex(); assert(SI.isValid()); // Remove dead registers or mask bits. 
SmallSet SeenRegs; - for (auto &MO : LastTrackedMI->operands()) { + for (auto &MO : CurrMI->operands()) { if (!MO.isReg() || !MO.getReg().isVirtual()) continue; if (MO.isUse() && !MO.readsReg()) continue; + if (!UseInternalIterator && MO.isDef()) + continue; if (!SeenRegs.insert(MO.getReg()).second) continue; - const LiveInterval &LI = LIS.getInterval(MO.getReg()); + const LiveInterval &LI = CurrLIS->getInterval(MO.getReg()); if (LI.hasSubRanges()) { auto It = LiveRegs.end(); for (const auto &S : LI.subranges()) { @@ -492,15 +661,22 @@ bool GCNDownwardRPTracker::advanceBeforeNext() { LastTrackedMI = nullptr; - return NextMI == MBBEnd; + return UseInternalIterator && (NextMI == MBBEnd); } -void GCNDownwardRPTracker::advanceToNext() { - LastTrackedMI = &*NextMI++; - NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); +void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI, + bool UseInternalIterator) { + if (UseInternalIterator) { + LastTrackedMI = &*NextMI++; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + } else { + LastTrackedMI = MI; + } + + MachineInstr *CurrMI = const_cast(LastTrackedMI); // Add new registers or mask bits. 
- for (const auto &MO : LastTrackedMI->all_defs()) { + for (const auto &MO : CurrMI->all_defs()) { Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; @@ -513,11 +689,12 @@ void GCNDownwardRPTracker::advanceToNext() { MaxPressure = max(MaxPressure, CurPressure); } -bool GCNDownwardRPTracker::advance() { - if (NextMI == MBBEnd) +bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator, + LiveIntervals *TheLIS) { + if (UseInternalIterator && NextMI == MBBEnd) return false; - advanceBeforeNext(); - advanceToNext(); + advanceBeforeNext(MI, UseInternalIterator, TheLIS); + advanceToNext(MI, UseInternalIterator); return true; } @@ -559,6 +736,65 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, }); } +void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) { + assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); + + SlotIndex SlotIdx; + SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); + + // Account for register pressure similar to RegPressureTracker::recede(). + RegisterOperands RegOpers; + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false); + RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); + + for (const RegisterMaskPair &Use : RegOpers.Uses) { + Register Reg = Use.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx); + if (LastUseMask.none()) + continue; + // The LastUseMask is queried from the liveness information of instruction + // which may be further down the schedule. Some lanes may actually not be + // last uses for the current position. + // FIXME: allow the caller to pass in the list of vreg uses that remain + // to be bottom-scheduled to avoid searching uses at each query. + SlotIndex CurrIdx; + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_iterator IdxPos = skipDebugInstructionsForward( + LastTrackedMI ? 
LastTrackedMI : MBB->begin(), MBB->end()); + if (IdxPos == MBB->end()) { + CurrIdx = LIS.getMBBEndIdx(MBB); + } else { + CurrIdx = LIS.getInstructionIndex(*IdxPos).getRegSlot(); + } + + LastUseMask = + findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, &LIS); + if (LastUseMask.none()) + continue; + + LaneBitmask LiveMask = LiveRegs[Reg]; + LaneBitmask NewMask = LiveMask & ~LastUseMask; + CurPressure.inc(Reg, LiveMask, NewMask, *MRI); + } + + // Generate liveness for defs. + for (const RegisterMaskPair &Def : RegOpers.Defs) { + Register Reg = Def.RegUnit; + if (!Reg.isVirtual()) + continue; + LaneBitmask LiveMask = LiveRegs[Reg]; + LaneBitmask NewMask = LiveMask | Def.LaneMask; + CurPressure.inc(Reg, LiveMask, NewMask, *MRI); + } + MaxPressure = max(MaxPressure, CurPressure); + + // Boost pressure for all dead defs together. + bumpDeadDefs(RegOpers.DeadDefs); +} + bool GCNUpwardRPTracker::isValid() const { const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex(); const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index d419fcc802c60a..a501a5a142884f 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -19,6 +19,7 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/RegisterPressure.h" #include namespace llvm { @@ -149,6 +150,9 @@ inline GCNRegPressure operator-(const GCNRegPressure &P1, return Diff; } +/////////////////////////////////////////////////////////////////////////////// +// GCNRPTracker + class GCNRPTracker { public: using LiveRegSet = DenseMap; @@ -165,7 +169,14 @@ class GCNRPTracker { void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, bool After); + void bumpDeadDefs(ArrayRef DeadDefs); + + LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; + public: + // reset tracker and set live register set to the specified 
value. + void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); + // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } @@ -182,34 +193,40 @@ class GCNRPTracker { GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); +//////////////////////////////////////////////////////////////////////////////// +// GCNUpwardRPTracker + class GCNUpwardRPTracker : public GCNRPTracker { public: GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} - // reset tracker and set live register set to the specified value. - void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); + using GCNRPTracker::reset; - // reset tracker at the specified slot index. + /// reset tracker at the specified slot index \p SI. void reset(const MachineRegisterInfo &MRI, SlotIndex SI) { - reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); + GCNRPTracker::reset(MRI, llvm::getLiveRegs(SI, LIS, MRI)); } - // reset tracker to the end of the MBB. + /// reset tracker to the end of the \p MBB. void reset(const MachineBasicBlock &MBB) { reset(MBB.getParent()->getRegInfo(), LIS.getSlotIndexes()->getMBBEndIdx(&MBB)); } - // reset tracker to the point just after MI (in program order). + /// reset tracker to the point just after \p MI (in program order). void reset(const MachineInstr &MI) { reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot()); } - // move to the state just before the MI (in program order). + /// Move to the state of RP just before the \p MI . If \p UseInternalIterator + /// is set, also update the internal iterators. Setting \p UseInternalIterator + /// to false allows for an externally managed iterator / program order. void recede(const MachineInstr &MI); - // checks whether the tracker's state after receding MI corresponds - // to reported by LIS. 
+ void bumpUpwardPressure(const MachineInstr *MI); + + /// \p returns whether the tracker's state after receding MI corresponds + /// to reported by LIS. bool isValid() const; const GCNRegPressure &getMaxPressure() const { return MaxPressure; } @@ -223,6 +240,9 @@ class GCNUpwardRPTracker : public GCNRPTracker { } }; +//////////////////////////////////////////////////////////////////////////////// +// GCNDownwardRPTracker + class GCNDownwardRPTracker : public GCNRPTracker { // Last position of reset or advanceBeforeNext MachineBasicBlock::const_iterator NextMI; @@ -232,37 +252,62 @@ class GCNDownwardRPTracker : public GCNRPTracker { public: GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + using GCNRPTracker::reset; + MachineBasicBlock::const_iterator getNext() const { return NextMI; } - // Return MaxPressure and clear it. + /// \p return MaxPressure and clear it. GCNRegPressure moveMaxPressure() { auto Res = MaxPressure; MaxPressure.clear(); return Res; } - // Reset tracker to the point before the MI - // filling live regs upon this point using LIS. - // Returns false if block is empty except debug values. + /// Reset tracker to the point before the \p MI + /// filling \p LiveRegs upon this point using LIS. + /// \p returns false if block is empty except debug values. bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); - // Move to the state right before the next MI or after the end of MBB. - // Returns false if reached end of the block. - bool advanceBeforeNext(); - - // Move to the state at the MI, advanceBeforeNext has to be called first. - void advanceToNext(); - - // Move to the state at the next MI. Returns false if reached end of block. - bool advance(); - - // Advance instructions until before End. + /// Move to the state right before the next MI or after the end of MBB. + /// \p returns false if reached end of the block. 
+ /// If \p UseInternalIterator is true, then internal iterators are used and + /// set to process in program order. If \p UseInternalIterator is false, then + /// it is assumed that the tracker is using an externally managed iterator, + /// and advance* calls will not update the state of the iterator. In such + /// cases, the tracker will move to the state right before the provided \p MI + /// and use the provided \p TheLIS for RP calculations. + bool advanceBeforeNext(MachineInstr *MI = nullptr, + bool UseInternalIterator = true, + LiveIntervals *TheLIS = nullptr); + + /// Move to the state at the MI, advanceBeforeNext has to be called first. + /// If \p UseInternalIterator is true, then internal iterators are used and + /// set to process in program order. If \p UseInternalIterator is false, then + /// it is assumed that the tracker is using an externally managed iterator, + /// and advance* calls will not update the state of the iterator. In such + /// cases, the tracker will move to the state at the provided \p MI . + void advanceToNext(MachineInstr *MI = nullptr, + bool UseInternalIterator = true); + + /// Move to the state at the next MI. \p returns false if reached end of + /// block. If \p UseInternalIterator is true, then internal iterators are used + /// and set to process in program order. If \p UseInternalIterator is false, + /// then it is assumed that the tracker is using an externally managed + /// iterator, and advance* calls will not update the state of the iterator. In + /// such cases, the tracker will move to the state right before the provided + /// \p MI and use the provided \p TheLIS for RP calculations. + bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true, + LiveIntervals *TheLIS = nullptr); + + /// Advance instructions until before \p End. bool advance(MachineBasicBlock::const_iterator End); - // Reset to Begin and advance to End. + /// Reset to \p Begin and advance to \p End. 
bool advance(MachineBasicBlock::const_iterator Begin, MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy = nullptr); + + void bumpDownwardPressure(const MachineInstr *MI); }; /// \returns the LaneMask of live lanes of \p Reg at position \p SI. Only the From 819fb01de14294df825b9eade515b55ba3eb0015 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 21 May 2024 18:04:25 -0700 Subject: [PATCH 03/27] [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling Change-Id: I6ae56149c1eb49ea85362267174cc6274c416330 --- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 1 - llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 90 ++++++++++++++++--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 19 +++- 4 files changed, 96 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 13504508e2fb2e..9b1db3241e4327 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " "target occupancy = " << TgtOcc << '\n'); - GCNMaxOccupancySchedStrategy LStrgy(Context); + GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler*/ true); unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index a501a5a142884f..063fb79fbf77bd 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -176,7 +176,6 @@ class GCNRPTracker { public: // reset tracker and set live register set to the specified value. 
void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_); - // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 0a1a72c230db85..1e6d95d128709d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -67,6 +67,7 @@ const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), + TheTracker(*C->LIS), TheUpwardTracker(*C->LIS), HasHighPressure(false) {} void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { @@ -156,14 +157,37 @@ static bool canUsePressureDiffs(const SUnit &SU) { static void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector &Pressure, - std::vector &MaxPressure) { + std::vector &MaxPressure, + GCNDownwardRPTracker &TheTracker, + GCNUpwardRPTracker &TheUpwardTracker, + ScheduleDAGMI *DAG) { // getDownwardPressure() and getUpwardPressure() make temporary changes to // the tracker, so we need to pass those function a non-const copy. 
RegPressureTracker &TempTracker = const_cast(RPTracker); - if (AtTop) - TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); - else - TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + if (!GCNTrackers) { + if (AtTop) + TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); + else + TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); + } else { + if (AtTop) { + GCNDownwardRPTracker TempTopTracker(TheTracker); + auto MI = SU->getInstr(); + TempTopTracker.advance(MI, true, DAG->getLIS()); + + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempTopTracker.getPressure().getVGPRNum(false); + } + + else { + GCNUpwardRPTracker TempBotTracker(TheUpwardTracker); + auto MI = SU->getInstr(); + TempBotTracker.recede(*MI, true); + + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempBotTracker.getPressure().getVGPRNum(false); + } + } } void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, @@ -192,8 +216,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of // PressureDiffs. - if (AtTop || !canUsePressureDiffs(*SU)) { - getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure); + if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { + getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, TheTracker, TheUpwardTracker, DAG); } else { // Reserve 4 slots. 
Pressure.resize(4, 0); @@ -211,7 +235,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector CheckPressure, CheckMaxPressure; +<<<<<<< HEAD getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); +======= + getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,TheTracker,TheUpwardTracker, DAG); +>>>>>>> 3fc6929b4a78... [AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != @@ -299,8 +327,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; if (DAG->isTrackingPressure()) { - SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; - VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + SGPRPressure = + GCNTrackers + ? (Zone.isTop() ? TheTracker.getPressure().getSGPRNum() + : TheUpwardTracker.getPressure().getSGPRNum()) + : Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + VGPRPressure = + GCNTrackers + ? (Zone.isTop() ? TheTracker.getPressure().getVGPRNum(false) + : TheUpwardTracker.getPressure().getVGPRNum(false)) + : Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; } ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { @@ -449,6 +485,16 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { return SU; } +void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (GCNTrackers) { + MachineInstr *MI = SU->getInstr(); + IsTopNode ? 
(void)TheTracker.advance(MI, true, DAG->getLIS()) + : TheUpwardTracker.recede(*MI, true); + } + + return GenericScheduler::schedNode(SU, IsTopNode); +} + GCNSchedStageID GCNSchedStrategy::getCurrentStage() { assert(CurrentStage && CurrentStage != SchedStages.end()); return *CurrentStage; @@ -475,12 +521,13 @@ GCNSchedStageID GCNSchedStrategy::getNextStage() const { } GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( - const MachineSchedContext *C) + const MachineSchedContext *C, bool IsLegacyScheduler) : GCNSchedStrategy(C) { SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); SchedStages.push_back(GCNSchedStageID::PreRARematerialize); + GCNTrackers = GCNTrackers & !IsLegacyScheduler; } GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) @@ -787,6 +834,20 @@ void GCNScheduleDAGMILive::runSchedStages() { continue; } + if (GCNTrackers) { + GCNDownwardRPTracker *TheTracker = S.getTracker(); + GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker(); + GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()]; + + reinterpret_cast(TheTracker)->reset( + Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), + *RegionLiveIns); + reinterpret_cast(TheUpwardTracker)->reset( + Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), + RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx())); + + } + ScheduleDAGMILive::schedule(); Stage->finalizeGCNRegion(); } @@ -1057,6 +1118,7 @@ void GCNSchedStage::finalizeGCNRegion() { void GCNSchedStage::checkScheduling() { // Check the results of scheduling. 
PressureAfter = DAG.getRealRegPressure(RegionIdx); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); @@ -1608,9 +1670,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, MachineInstr *MI = Entry.first; MachineInstr *OldMI = Entry.second; - // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. - DAG.BBLiveInMap.erase(OldMI); - // Remove OldMI and update LIS Register Reg = MI->getOperand(0).getReg(); LIS->RemoveMachineInstrFromMaps(*OldMI); @@ -1628,6 +1687,11 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, DAG.Regions = NewRegions; DAG.RescheduleRegions = NewRescheduleRegions; + DAG.BBLiveInMap = DAG.getBBLiveInMap(); + + if (GCNTrackers) + DAG.RegionLiveOuts.buildLiveRegMap(); + SIMachineFunctionInfo &MFI = *MF.getInfo(); MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index c402fb1ef373c9..8088339fbd26c2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -70,6 +70,12 @@ class GCNSchedStrategy : public GenericScheduler { // Pointer to the current SchedStageID. SmallVectorImpl::iterator CurrentStage = nullptr; + // GCN RP Tracker for top-down scheduling + mutable GCNDownwardRPTracker TheTracker; + + // GCN RP Tracker for botttom-up scheduling + mutable GCNUpwardRPTracker TheUpwardTracker; + public: // schedule() have seen register pressure over the critical limits and had to // track register pressure for actual scheduling heuristics. 
@@ -102,6 +108,8 @@ class GCNSchedStrategy : public GenericScheduler { SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void initialize(ScheduleDAGMI *DAG) override; unsigned getTargetOccupancy() { return TargetOccupancy; } @@ -116,13 +124,19 @@ class GCNSchedStrategy : public GenericScheduler { bool hasNextStage() const; GCNSchedStageID getNextStage() const; + + GCNDownwardRPTracker *getTracker() { return &TheTracker; } + + GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; } + }; /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. /// maximum number of waves per simd). class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy { public: - GCNMaxOccupancySchedStrategy(const MachineSchedContext *C); + GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, + bool IsLegacyScheduler = false); }; /// The goal of this scheduling strategy is to maximize ILP for a single wave @@ -350,6 +364,9 @@ class GCNSchedStage { bool isRegionWithExcessRP() const { return DAG.RegionsWithExcessRP[RegionIdx]; } + + // The region number this stage is currently working on + unsigned getRegionIdx() { return RegionIdx; } // Returns true if the new schedule may result in more spilling. 
bool mayCauseSpilling(unsigned WavesAfter); From b538f212c3f36427929c696140dcff82b637a3cb Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 14 Jun 2024 14:46:28 -0700 Subject: [PATCH 04/27] Formatting Change-Id: I1cb0a88e94f4156da6118fcd3724556939351c6d --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 46 +++++++++++---------- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 3 +- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 1e6d95d128709d..a6115afe0f03ce 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -66,9 +66,8 @@ static cl::opt GCNTrackers( const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) - : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), - TheTracker(*C->LIS), TheUpwardTracker(*C->LIS), - HasHighPressure(false) {} + : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), TheTracker(*C->LIS), + TheUpwardTracker(*C->LIS), HasHighPressure(false) {} void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -175,8 +174,10 @@ static void getRegisterPressures(bool AtTop, auto MI = SU->getInstr(); TempTopTracker.advance(MI, true, DAG->getLIS()); - Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempTopTracker.getPressure().getVGPRNum(false); + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = + TempTopTracker.getPressure().getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + TempTopTracker.getPressure().getVGPRNum(false); } else { @@ -184,8 +185,10 @@ static void getRegisterPressures(bool AtTop, auto MI = SU->getInstr(); TempBotTracker.recede(*MI, true); - Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum(); - 
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempBotTracker.getPressure().getVGPRNum(false); + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = + TempBotTracker.getPressure().getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + TempBotTracker.getPressure().getVGPRNum(false); } } } @@ -217,7 +220,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of // PressureDiffs. if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { - getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, TheTracker, TheUpwardTracker, DAG); + getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, + TheTracker, TheUpwardTracker, DAG); } else { // Reserve 4 slots. Pressure.resize(4, 0); @@ -235,11 +239,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector CheckPressure, CheckMaxPressure; -<<<<<<< HEAD - getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure); -======= - getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,TheTracker,TheUpwardTracker, DAG); ->>>>>>> 3fc6929b4a78... 
[AMDGPU] Optionally Use AMDGPU RPTrackers during scheduling + getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, + TheTracker, TheUpwardTracker, DAG); if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != @@ -837,15 +838,16 @@ void GCNScheduleDAGMILive::runSchedStages() { if (GCNTrackers) { GCNDownwardRPTracker *TheTracker = S.getTracker(); GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker(); - GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()]; - - reinterpret_cast(TheTracker)->reset( - Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), - *RegionLiveIns); - reinterpret_cast(TheUpwardTracker)->reset( - Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), - RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx())); - + GCNRPTracker::LiveRegSet *RegionLiveIns = + &LiveIns[Stage->getRegionIdx()]; + + reinterpret_cast(TheTracker) + ->reset(Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), + *RegionLiveIns); + reinterpret_cast(TheUpwardTracker) + ->reset( + Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), + RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx())); } ScheduleDAGMILive::schedule(); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 8088339fbd26c2..e8c89b2f1baf27 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -128,7 +128,6 @@ class GCNSchedStrategy : public GenericScheduler { GCNDownwardRPTracker *getTracker() { return &TheTracker; } GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; } - }; /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. 
@@ -364,7 +363,7 @@ class GCNSchedStage { bool isRegionWithExcessRP() const { return DAG.RegionsWithExcessRP[RegionIdx]; } - + // The region number this stage is currently working on unsigned getRegionIdx() { return RegionIdx; } From 653e153f2c206e60ce2b31364a2a6bc371af8e1a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 27 May 2024 10:43:43 -0700 Subject: [PATCH 05/27] Actually use the iterative trackers Change-Id: I198925f5ed91b0a49ac265e19fdbe2208139f09a --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index a6115afe0f03ce..320acbaf5b22a6 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -172,7 +172,7 @@ static void getRegisterPressures(bool AtTop, if (AtTop) { GCNDownwardRPTracker TempTopTracker(TheTracker); auto MI = SU->getInstr(); - TempTopTracker.advance(MI, true, DAG->getLIS()); + TempTopTracker.advance(MI, false, DAG->getLIS()); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempTopTracker.getPressure().getSGPRNum(); @@ -183,7 +183,7 @@ static void getRegisterPressures(bool AtTop, else { GCNUpwardRPTracker TempBotTracker(TheUpwardTracker); auto MI = SU->getInstr(); - TempBotTracker.recede(*MI, true); + TempBotTracker.recede(*MI, false); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum(); @@ -489,8 +489,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (GCNTrackers) { MachineInstr *MI = SU->getInstr(); - IsTopNode ? (void)TheTracker.advance(MI, true, DAG->getLIS()) - : TheUpwardTracker.recede(*MI, true); + IsTopNode ? 
(void)TheTracker.advance(MI, false, DAG->getLIS()) + : TheUpwardTracker.recede(*MI, false); } return GenericScheduler::schedNode(SU, IsTopNode); From 5d92149e23acba84cf932bc5d45bfc024504a55f Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 28 May 2024 13:24:09 -0700 Subject: [PATCH 06/27] Review Comments Change-Id: Ifa69110bf0a239ea14d25c0bad03215d1b018656 --- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 51 +++++++++---------- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8 +-- 3 files changed, 30 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 9b1db3241e4327..e89016b0ae984e 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " "target occupancy = " << TgtOcc << '\n'); - GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler*/ true); + GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/ true); unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 320acbaf5b22a6..e4d32b6eefb9b1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -66,8 +66,8 @@ static cl::opt GCNTrackers( const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) - : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), TheTracker(*C->LIS), - TheUpwardTracker(*C->LIS), HasHighPressure(false) {} + : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS), + UpwardTracker(*C->LIS), HasHighPressure(false) {} void 
GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -157,8 +157,8 @@ static void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector &Pressure, std::vector &MaxPressure, - GCNDownwardRPTracker &TheTracker, - GCNUpwardRPTracker &TheUpwardTracker, + GCNDownwardRPTracker &DownwardTracker, + GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG) { // getDownwardPressure() and getUpwardPressure() make temporary changes to // the tracker, so we need to pass those function a non-const copy. @@ -170,7 +170,7 @@ static void getRegisterPressures(bool AtTop, TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); } else { if (AtTop) { - GCNDownwardRPTracker TempTopTracker(TheTracker); + GCNDownwardRPTracker TempTopTracker(DownwardTracker); auto MI = SU->getInstr(); TempTopTracker.advance(MI, false, DAG->getLIS()); @@ -181,7 +181,7 @@ static void getRegisterPressures(bool AtTop, } else { - GCNUpwardRPTracker TempBotTracker(TheUpwardTracker); + GCNUpwardRPTracker TempBotTracker(UpwardTracker); auto MI = SU->getInstr(); TempBotTracker.recede(*MI, false); @@ -221,7 +221,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // PressureDiffs. if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, - TheTracker, TheUpwardTracker, DAG); + DownwardTracker, UpwardTracker, DAG); } else { // Reserve 4 slots. 
Pressure.resize(4, 0); @@ -240,7 +240,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector CheckPressure, CheckMaxPressure; getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, - TheTracker, TheUpwardTracker, DAG); + TheTracker, UpwardTracker, DAG); if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != @@ -330,13 +330,13 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, if (DAG->isTrackingPressure()) { SGPRPressure = GCNTrackers - ? (Zone.isTop() ? TheTracker.getPressure().getSGPRNum() - : TheUpwardTracker.getPressure().getSGPRNum()) + ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum() + : UpwardTracker.getPressure().getSGPRNum()) : Pressure[AMDGPU::RegisterPressureSets::SReg_32]; VGPRPressure = GCNTrackers - ? (Zone.isTop() ? TheTracker.getPressure().getVGPRNum(false) - : TheUpwardTracker.getPressure().getVGPRNum(false)) + ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false) + : UpwardTracker.getPressure().getVGPRNum(false)) : Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; } ReadyQueue &Q = Zone.Available; @@ -489,8 +489,8 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (GCNTrackers) { MachineInstr *MI = SU->getInstr(); - IsTopNode ? (void)TheTracker.advance(MI, false, DAG->getLIS()) - : TheUpwardTracker.recede(*MI, false); + IsTopNode ? 
(void)DownwardTracker.advance(MI, false, DAG->getLIS()) + : UpwardTracker.recede(*MI, false); } return GenericScheduler::schedNode(SU, IsTopNode); @@ -836,18 +836,17 @@ void GCNScheduleDAGMILive::runSchedStages() { } if (GCNTrackers) { - GCNDownwardRPTracker *TheTracker = S.getTracker(); - GCNUpwardRPTracker *TheUpwardTracker = S.getUpwardTracker(); - GCNRPTracker::LiveRegSet *RegionLiveIns = - &LiveIns[Stage->getRegionIdx()]; - - reinterpret_cast(TheTracker) - ->reset(Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), - *RegionLiveIns); - reinterpret_cast(TheUpwardTracker) - ->reset( - Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), - RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx())); + GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); + GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); + GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()]; + + reinterpret_cast(DownwardTracker)->reset( + Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), + *RegionLiveIns); + reinterpret_cast(UpwardTracker)->reset( + Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), + RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx())); + } ScheduleDAGMILive::schedule(); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index e8c89b2f1baf27..91b4c0c63d2bb3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -71,10 +71,10 @@ class GCNSchedStrategy : public GenericScheduler { SmallVectorImpl::iterator CurrentStage = nullptr; // GCN RP Tracker for top-down scheduling - mutable GCNDownwardRPTracker TheTracker; + mutable GCNDownwardRPTracker DownwardTracker; // GCN RP Tracker for botttom-up scheduling - mutable GCNUpwardRPTracker TheUpwardTracker; + mutable GCNUpwardRPTracker UpwardTracker; public: // schedule() have seen register pressure over the critical limits and had to @@ -125,9 +125,9 
@@ class GCNSchedStrategy : public GenericScheduler { GCNSchedStageID getNextStage() const; - GCNDownwardRPTracker *getTracker() { return &TheTracker; } + GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; } - GCNUpwardRPTracker *getUpwardTracker() { return &TheUpwardTracker; } + GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; } }; /// The goal of this scheduling strategy is to maximize kernel occupancy (i.e. From b1b81cc6a4813c908f129392841eb425a447cd59 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 28 May 2024 13:29:41 -0700 Subject: [PATCH 07/27] Use DAG.MRI Change-Id: I9f0275a0cede9e77dfd29262124f2a856f436c8c --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index e4d32b6eefb9b1..c3bee344764160 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -840,13 +840,11 @@ void GCNScheduleDAGMILive::runSchedStages() { GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()]; - reinterpret_cast(DownwardTracker)->reset( - Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), - *RegionLiveIns); - reinterpret_cast(UpwardTracker)->reset( - Regions[Stage->getRegionIdx()].first->getMF()->getRegInfo(), - RegionLiveOuts.getLiveRegsForRegionIdx(Stage->getRegionIdx())); - + reinterpret_cast(DownwardTracker) + ->reset(MRI, *RegionLiveIns); + reinterpret_cast(UpwardTracker) + ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( + Stage->getRegionIdx())); } ScheduleDAGMILive::schedule(); From 83fea0a87f236f7bcc15beb3bbb57a9eee356c92 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 28 May 2024 13:52:29 -0700 Subject: [PATCH 08/27] Formatting Change-Id: I74c19a2cf20d2325178933f81e0e8716d7c62f17 --- 
llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index e89016b0ae984e..da065e8d8cb6b8 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -480,7 +480,7 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " "target occupancy = " << TgtOcc << '\n'); - GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/ true); + GCNMaxOccupancySchedStrategy LStrgy(Context, /*IsLegacyScheduler=*/true); unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index c3bee344764160..724ffa4494323c 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -66,8 +66,9 @@ static cl::opt GCNTrackers( const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) - : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), DownwardTracker(*C->LIS), - UpwardTracker(*C->LIS), HasHighPressure(false) {} + : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), + DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { +} void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { GenericScheduler::initialize(DAG); @@ -329,10 +330,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, unsigned VGPRPressure = 0; if (DAG->isTrackingPressure()) { SGPRPressure = - GCNTrackers - ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum() - : UpwardTracker.getPressure().getSGPRNum()) - : Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + GCNTrackers ? 
(Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum() + : UpwardTracker.getPressure().getSGPRNum()) + : Pressure[AMDGPU::RegisterPressureSets::SReg_32]; VGPRPressure = GCNTrackers ? (Zone.isTop() ? DownwardTracker.getPressure().getVGPRNum(false) @@ -838,7 +838,8 @@ void GCNScheduleDAGMILive::runSchedStages() { if (GCNTrackers) { GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); - GCNRPTracker::LiveRegSet *RegionLiveIns = &LiveIns[Stage->getRegionIdx()]; + GCNRPTracker::LiveRegSet *RegionLiveIns = + &LiveIns[Stage->getRegionIdx()]; reinterpret_cast(DownwardTracker) ->reset(MRI, *RegionLiveIns); From 3947cbb76bf10a573f6c29d3a31c4cd5eb619161 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 14 Jun 2024 15:03:02 -0700 Subject: [PATCH 09/27] Review comments Change-Id: I09f9ca74c07b516daed0e93a85937df8b9aa922b --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 724ffa4494323c..5006ea37e2564b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -329,15 +329,16 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, unsigned SGPRPressure = 0; unsigned VGPRPressure = 0; if (DAG->isTrackingPressure()) { - SGPRPressure = - GCNTrackers ? (Zone.isTop() ? DownwardTracker.getPressure().getSGPRNum() - : UpwardTracker.getPressure().getSGPRNum()) - : Pressure[AMDGPU::RegisterPressureSets::SReg_32]; - VGPRPressure = - GCNTrackers - ? (Zone.isTop() ? 
DownwardTracker.getPressure().getVGPRNum(false) - : UpwardTracker.getPressure().getVGPRNum(false)) - : Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + if (!GCNTrackers) { + SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; + VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; + } else { + GCNRPTracker *T = &UpwardTracker; + if (Zone.isTop()) + T = &DownwardTracker; + SGPRPressure = T->getPressure().getSGPRNum(); + VGPRPressure = T->getPressure().getVGPRNum(false); + } } ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { From a8600c83b237d67f78969840b8995ed8aa08b80d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 14 Jun 2024 16:14:57 -0700 Subject: [PATCH 10/27] Allocate Pressure vector Change-Id: I5effce973fa2d945076e89b4453a844f0fc85fc9 --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 5006ea37e2564b..cdafa01eeb857a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -170,6 +170,7 @@ static void getRegisterPressures(bool AtTop, else TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); } else { + Pressure.resize(4, 0); if (AtTop) { GCNDownwardRPTracker TempTopTracker(DownwardTracker); auto MI = SU->getInstr(); From 4d3a3ca2dccc8be90bbd82e634cc158580e39eae Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 18 Jun 2024 11:39:48 -0700 Subject: [PATCH 11/27] Remove flag from upward RPTracker Change-Id: I6217c03f56d34f584e5b23cf7c4462842bc7173b --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index cdafa01eeb857a..0c7639462905d7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -185,7 +185,7 @@ 
static void getRegisterPressures(bool AtTop, else { GCNUpwardRPTracker TempBotTracker(UpwardTracker); auto MI = SU->getInstr(); - TempBotTracker.recede(*MI, false); + TempBotTracker.recede(*MI); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempBotTracker.getPressure().getSGPRNum(); @@ -492,7 +492,7 @@ void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (GCNTrackers) { MachineInstr *MI = SU->getInstr(); IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS()) - : UpwardTracker.recede(*MI, false); + : UpwardTracker.recede(*MI); } return GenericScheduler::schedNode(SU, IsTopNode); From d468edec0da5875f4786a80315121ba806a96b82 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 19 Jun 2024 11:45:32 -0700 Subject: [PATCH 12/27] Review comments Change-Id: Ibeaba6cab034636472b20c36adfadabbbc2c19ef --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 53 ++++++++++----------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 0c7639462905d7..c5d217d80a7c8a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -165,33 +165,30 @@ static void getRegisterPressures(bool AtTop, // the tracker, so we need to pass those function a non-const copy. RegPressureTracker &TempTracker = const_cast(RPTracker); if (!GCNTrackers) { - if (AtTop) - TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure); - else - TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); - } else { - Pressure.resize(4, 0); - if (AtTop) { - GCNDownwardRPTracker TempTopTracker(DownwardTracker); - auto MI = SU->getInstr(); - TempTopTracker.advance(MI, false, DAG->getLIS()); - - Pressure[AMDGPU::RegisterPressureSets::SReg_32] = - TempTopTracker.getPressure().getSGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - TempTopTracker.getPressure().getVGPRNum(false); - } + AtTop + ? 
TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure) + : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); - else { - GCNUpwardRPTracker TempBotTracker(UpwardTracker); - auto MI = SU->getInstr(); - TempBotTracker.recede(*MI); + return; + } - Pressure[AMDGPU::RegisterPressureSets::SReg_32] = - TempBotTracker.getPressure().getSGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - TempBotTracker.getPressure().getVGPRNum(false); - } + // GCNTrackers + Pressure.resize(4, 0); + MachineInstr *MI = SU->getInstr(); + if (AtTop) { + GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); + TempDownwardTracker.advance(MI, false, DAG->getLIS()); + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = + TempDownwardTracker.getPressure().getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + TempDownwardTracker.getPressure().getVGPRNum(false); + } else { + GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); + TempUpwardTracker.recede(*MI); + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = + TempUpwardTracker.getPressure().getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + TempUpwardTracker.getPressure().getVGPRNum(false); } } @@ -334,9 +331,9 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; } else { - GCNRPTracker *T = &UpwardTracker; - if (Zone.isTop()) - T = &DownwardTracker; + GCNRPTracker *T = Zone.isTop() + ? 
static_cast(&UpwardTracker) + : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); VGPRPressure = T->getPressure().getVGPRNum(false); } From 3d072b4e1d9df6f4a0c1359a834afe35a4053f51 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 20 Jun 2024 08:49:26 -0700 Subject: [PATCH 13/27] Dont modify existing PreRARematStage LiveIn handling Change-Id: I96c99f12c59ef0eea86f7fbf134913ecc47dd6f2 --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index c5d217d80a7c8a..d48e33f7df950a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1669,6 +1669,9 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, MachineInstr *MI = Entry.first; MachineInstr *OldMI = Entry.second; + // Remove OldMI from BBLiveInMap since we are sinking it from its MBB. 
+ DAG.BBLiveInMap.erase(OldMI); + // Remove OldMI and update LIS Register Reg = MI->getOperand(0).getReg(); LIS->RemoveMachineInstrFromMaps(*OldMI); @@ -1686,8 +1689,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST, DAG.Regions = NewRegions; DAG.RescheduleRegions = NewRescheduleRegions; - DAG.BBLiveInMap = DAG.getBBLiveInMap(); - if (GCNTrackers) DAG.RegionLiveOuts.buildLiveRegMap(); From bb1e24129f24a59b1e5588f6db304ae5fc4baaf1 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 12 Aug 2024 13:55:44 -0700 Subject: [PATCH 14/27] Use GCNTracker RP speculation Change-Id: I3e893ca2ffcf1032fe157b537c9563565215b123 --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d48e33f7df950a..7ce8d8c56baf56 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -177,18 +177,18 @@ static void getRegisterPressures(bool AtTop, MachineInstr *MI = SU->getInstr(); if (AtTop) { GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); - TempDownwardTracker.advance(MI, false, DAG->getLIS()); + TempDownwardTracker.bumpDownwardPressure(MI); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempDownwardTracker.getPressure().getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - TempDownwardTracker.getPressure().getVGPRNum(false); + TempDownwardTracker.getPressure().getArchVGPRNum(); } else { GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); - TempUpwardTracker.recede(*MI); + TempUpwardTracker.bumpUpwardPressure(MI); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempUpwardTracker.getPressure().getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - TempUpwardTracker.getPressure().getVGPRNum(false); + TempUpwardTracker.getPressure().getArchVGPRNum(); } } From 366b90fbb731a958b2e44d95e44e0f991007a47a Mon Sep 17 00:00:00 
2001 From: Jeffrey Byrnes Date: Tue, 20 Aug 2024 12:29:33 -0700 Subject: [PATCH 15/27] Port changes from pull/93088 Change-Id: I2de464b32d3c6ed9a77cbbc669d735dde63c2e47 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 45 +++++++++++++---------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 9bf1d6260359fc..8943612f51ef61 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -306,11 +306,11 @@ static LaneBitmask getRegLanes(ArrayRef RegUnits, return I->LaneMask; } -static LaneBitmask -getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, - bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, - LaneBitmask SafeDefault, - bool (*Property)(const LiveRange &LR, SlotIndex Pos)) { +static LaneBitmask getLanesWithProperty( + const LiveIntervals &LIS, const MachineRegisterInfo &MRI, + bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, + LaneBitmask SafeDefault, + function_ref Property) { if (RegUnit.isVirtual()) { const LiveInterval &LI = LIS.getInterval(RegUnit); LaneBitmask Result; @@ -325,14 +325,14 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, } return Result; - } else { - const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); - // Be prepared for missing liveranges: We usually do not compute liveranges - // for physical registers on targets with many registers (GPUs). - if (LR == nullptr) - return SafeDefault; - return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone(); } + + const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); + // Be prepared for missing liveranges: We usually do not compute liveranges + // for physical registers on targets with many registers (GPUs). + if (LR == nullptr) + return SafeDefault; + return Property(*LR, Pos) ? 
LaneBitmask::getAll() : LaneBitmask::getNone(); } /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx). @@ -341,19 +341,21 @@ getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, SlotIndex PriorUseIdx, SlotIndex NextUseIdx, const MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI, const LiveIntervals *LIS, bool Upward = false) { - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); - for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { if (MO.isUndef()) continue; + if (!MO.readsReg()) + continue; const MachineInstr *MI = MO.getParent(); SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx) : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx); if (InRange) { unsigned SubRegIdx = MO.getSubReg(); - LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); LastUseMask &= ~UseMask; if (LastUseMask.none()) return LaneBitmask::getNone(); @@ -528,7 +530,9 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) { // Account for register pressure similar to RegPressureTracker::recede(). RegisterOperands RegOpers; - const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + + const SIRegisterInfo *TRI = + MI->getMF()->getSubtarget().getRegisterInfo(); RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true); assert(RegOpers.DeadDefs.empty()); RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); @@ -569,8 +573,8 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) { LastTrackedMI ? 
LIS.getInstructionIndex(*LastTrackedMI).getRegSlot() : LIS.getMBBEndIdx(MI->getParent()); ; - LaneBitmask LastUseMask = - findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, *MRI, &LIS, true); + LaneBitmask LastUseMask = findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, + *MRI, TRI, &LIS, true); LastUseMask &= ~LiveAfter; LaneBitmask LiveBefore = (LiveAfter | LastUseMask); CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI); @@ -744,7 +748,8 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) { // Account for register pressure similar to RegPressureTracker::recede(). RegisterOperands RegOpers; - const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + const SIRegisterInfo *TRI = + MI->getMF()->getSubtarget().getRegisterInfo(); RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false); RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); @@ -771,7 +776,7 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) { } LastUseMask = - findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, &LIS); + findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, TRI, &LIS); if (LastUseMask.none()) continue; From 0b3e08fc2d727b5744bf34b64107dda0bf9d09b9 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 21 Aug 2024 15:16:05 -0700 Subject: [PATCH 16/27] Port changes from pull/93088 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 16 +++++----------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 4 ++-- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 8943612f51ef61..99baeebc532ae7 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -328,8 +328,6 @@ static LaneBitmask getLanesWithProperty( } const LiveRange *LR = LIS.getCachedRegUnit(RegUnit); - // Be prepared for missing liveranges: We usually do not compute liveranges - // for physical registers on targets with many registers 
(GPUs). if (LR == nullptr) return SafeDefault; return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone(); @@ -344,11 +342,9 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, const SIRegisterInfo *TRI, const LiveIntervals *LIS, bool Upward = false) { - for (const MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { + for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { if (MO.isUndef()) continue; - if (!MO.readsReg()) - continue; const MachineInstr *MI = MO.getParent(); SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); bool InRange = Upward ? (InstSlot > PriorUseIdx && InstSlot <= NextUseIdx) @@ -523,7 +519,8 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(CurPressure == getRegPressure(*MRI, LiveRegs)); } -void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) { +void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) { assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); @@ -531,8 +528,6 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI) { // Account for register pressure similar to RegPressureTracker::recede(). 
RegisterOperands RegOpers; - const SIRegisterInfo *TRI = - MI->getMF()->getSubtarget().getRegisterInfo(); RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true); assert(RegOpers.DeadDefs.empty()); RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); @@ -740,7 +735,8 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, }); } -void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) { +void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) { assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); SlotIndex SlotIdx; @@ -748,8 +744,6 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI) { // Account for register pressure similar to RegPressureTracker::recede(). RegisterOperands RegOpers; - const SIRegisterInfo *TRI = - MI->getMF()->getSubtarget().getRegisterInfo(); RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false); RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 063fb79fbf77bd..62a7f25aa36bb6 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -222,7 +222,7 @@ class GCNUpwardRPTracker : public GCNRPTracker { /// to false allows for an externally managed iterator / program order. void recede(const MachineInstr &MI); - void bumpUpwardPressure(const MachineInstr *MI); + void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI); /// \p returns whether the tracker's state after receding MI corresponds /// to reported by LIS. @@ -306,7 +306,7 @@ class GCNDownwardRPTracker : public GCNRPTracker { MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy = nullptr); - void bumpDownwardPressure(const MachineInstr *MI); + void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI); }; /// \returns the LaneMask of live lanes of \p Reg at position \p SI. 
Only the From 9de7cc2b3d90eebc4e66496648d25be0c6d2c3e5 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 21 Aug 2024 15:34:33 -0700 Subject: [PATCH 17/27] Feed SIRegisterInfo to Trackers + Propagate unused AGPR speculative pressure + Use correct previous VGPR pressure --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 26 +++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 7ce8d8c56baf56..bf812e840b876c 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -154,13 +154,11 @@ static bool canUsePressureDiffs(const SUnit &SU) { return true; } -static void getRegisterPressures(bool AtTop, - const RegPressureTracker &RPTracker, SUnit *SU, - std::vector &Pressure, - std::vector &MaxPressure, - GCNDownwardRPTracker &DownwardTracker, - GCNUpwardRPTracker &UpwardTracker, - ScheduleDAGMI *DAG) { +static void getRegisterPressures( + bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, + std::vector &Pressure, std::vector &MaxPressure, + GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, + ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { // getDownwardPressure() and getUpwardPressure() make temporary changes to // the tracker, so we need to pass those function a non-const copy. 
RegPressureTracker &TempTracker = const_cast(RPTracker); @@ -177,18 +175,22 @@ static void getRegisterPressures(bool AtTop, MachineInstr *MI = SU->getInstr(); if (AtTop) { GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); - TempDownwardTracker.bumpDownwardPressure(MI); + TempDownwardTracker.bumpDownwardPressure(MI, SRI); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempDownwardTracker.getPressure().getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempDownwardTracker.getPressure().getArchVGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = + TempDownwardTracker.getPressure().getAGPRNum(); } else { GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); - TempUpwardTracker.bumpUpwardPressure(MI); + TempUpwardTracker.bumpUpwardPressure(MI, SRI); Pressure[AMDGPU::RegisterPressureSets::SReg_32] = TempUpwardTracker.getPressure().getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempUpwardTracker.getPressure().getArchVGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = + TempDownwardTracker.getPressure().getAGPRNum(); } } @@ -220,7 +222,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, // PressureDiffs. if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, - DownwardTracker, UpwardTracker, DAG); + DownwardTracker, UpwardTracker, DAG, SRI); } else { // Reserve 4 slots. 
Pressure.resize(4, 0); @@ -239,7 +241,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, #ifdef EXPENSIVE_CHECKS std::vector CheckPressure, CheckMaxPressure; getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, - TheTracker, UpwardTracker, DAG); + TheTracker, UpwardTracker, DAG, SRI); if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != @@ -335,7 +337,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, ? static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); - VGPRPressure = T->getPressure().getVGPRNum(false); + VGPRPressure = T->getPressure().getArchVGPRNum(); } } ReadyQueue &Q = Zone.Available; From a97ee42105f2f9d382c19381d7e94c243dfb4a9a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 5 Sep 2024 08:24:43 -0700 Subject: [PATCH 18/27] Review comments Change-Id: I286c9ed1ae91a68da881c6fa27f5f391102d0a9c --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 68 +++++++++++++-------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 11 ++++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +- 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 99baeebc532ae7..b1658e261cedd3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -296,6 +296,7 @@ collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, } } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getRegLanes(ArrayRef RegUnits, Register RegUnit) { auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { @@ -306,6 +307,7 @@ static LaneBitmask getRegLanes(ArrayRef RegUnits, return I->LaneMask; } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getLanesWithProperty( const 
LiveIntervals &LIS, const MachineRegisterInfo &MRI, bool TrackLaneMasks, Register RegUnit, SlotIndex Pos, @@ -333,6 +335,7 @@ static LaneBitmask getLanesWithProperty( return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone(); } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx). /// The query starts with a lane bitmask which gets lanes/bits removed for every /// use we find. @@ -360,6 +363,35 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, return LastUseMask; } +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp +static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + bool TrackLaneMasks, Register RegUnit, + SlotIndex Pos) { + return getLanesWithProperty( + LIS, MRI, TrackLaneMasks, RegUnit, Pos, LaneBitmask::getAll(), + [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); }); +} + +// Copy/paste from RegisterPressure.cpp (RegisterOperands::adjustLaneLiveness) +static void adjustDefLaneLiveness(SmallVectorImpl &Defs, + SlotIndex &Pos, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { + for (auto *I = Defs.begin(); I != Defs.end();) { + LaneBitmask LiveAfter = + getLiveLanesAt(LIS, MRI, true, I->RegUnit, Pos.getDeadSlot()); + // If the def is all that is live after the instruction, then in case + // of a subregister def we need a read-undef flag. 
+ LaneBitmask ActualDef = I->LaneMask & LiveAfter; + if (ActualDef.none()) { + I = Defs.erase(I); + } else { + I->LaneMask = ActualDef; + ++I; + } + } +} + /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -427,6 +459,7 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, } void GCNRPTracker::bumpDeadDefs(ArrayRef DeadDefs) { + GCNRegPressure TempPressure = CurPressure; for (const RegisterMaskPair &P : DeadDefs) { Register Reg = P.RegUnit; if (!Reg.isVirtual()) @@ -436,16 +469,9 @@ void GCNRPTracker::bumpDeadDefs(ArrayRef DeadDefs) { CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI); } MaxPressure = max(MaxPressure, CurPressure); - for (const RegisterMaskPair &P : DeadDefs) { - Register Reg = P.RegUnit; - if (!Reg.isVirtual()) - continue; - LaneBitmask LiveMask = LiveRegs[Reg]; - LaneBitmask BumpedMask = LiveMask | P.LaneMask; - CurPressure.inc(Reg, BumpedMask, LiveMask, *MRI); - } + CurPressure = TempPressure; } - +/// Mostly copy/paste from CodeGen/RegisterPressure.cpp LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, SlotIndex Pos) const { return getLanesWithProperty( @@ -530,7 +556,7 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true); assert(RegOpers.DeadDefs.empty()); - RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); + adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI); RegOpers.detectDeadDefs(*MI, LIS); // Boost max pressure for all dead defs together. @@ -547,11 +573,7 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, LaneBitmask DefLanes = P.LaneMask; LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes; - // There may be parts of the register that were dead before the - // instruction, but became live afterwards. Similarly, some parts - // may have been killed in this instruction. 
CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI); - CurPressure.inc(Reg, LiveAfter, ~LiveAfter & LiveBefore, *MRI); MaxPressure = max(MaxPressure, CurPressure); } // Generate liveness for uses. @@ -559,19 +581,8 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, Register Reg = P.RegUnit; if (!Reg.isVirtual()) continue; - // If this register was also in a def operand, we've handled it - // with defs. - if (getRegLanes(RegOpers.Defs, Reg).any()) - continue; LaneBitmask LiveAfter = LiveRegs[Reg]; - SlotIndex CurrIdx = - LastTrackedMI ? LIS.getInstructionIndex(*LastTrackedMI).getRegSlot() - : LIS.getMBBEndIdx(MI->getParent()); - ; - LaneBitmask LastUseMask = findUseBetween(Reg, P.LaneMask, SlotIdx, CurrIdx, - *MRI, TRI, &LIS, true); - LastUseMask &= ~LiveAfter; - LaneBitmask LiveBefore = (LiveAfter | LastUseMask); + LaneBitmask LiveBefore = LiveAfter | P.LaneMask; CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI); } MaxPressure = max(MaxPressure, CurPressure); @@ -692,8 +703,13 @@ bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator, LiveIntervals *TheLIS) { if (UseInternalIterator && NextMI == MBBEnd) return false; + advanceBeforeNext(MI, UseInternalIterator, TheLIS); advanceToNext(MI, UseInternalIterator); + if (!UseInternalIterator) { + // We must remove any dead def lanes from the current RP + advanceBeforeNext(MI, true, TheLIS); + } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 62a7f25aa36bb6..ae17ea4c348bd3 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -169,6 +169,7 @@ class GCNRPTracker { void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, bool After); + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp void bumpDeadDefs(ArrayRef DeadDefs); LaneBitmask getLastUsedLanes(Register RegUnit, SlotIndex Pos) const; @@ -222,6 +223,11 @@ class GCNUpwardRPTracker : 
public GCNRPTracker { /// to false allows for an externally managed iterator / program order. void recede(const MachineInstr &MI); + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp + /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This + /// does not rely on the implicit program ordering in the LiveIntervals to + /// support RP Speculation. It leaves the state of pressure inconsistent with + /// the current position void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI); /// \p returns whether the tracker's state after receding MI corresponds @@ -306,6 +312,11 @@ class GCNDownwardRPTracker : public GCNRPTracker { MachineBasicBlock::const_iterator End, const LiveRegSet *LiveRegsCopy = nullptr); + /// Mostly copy/paste from CodeGen/RegisterPressure.cpp + /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This + /// does not rely on the implicit program ordering in the LiveIntervals to + /// support RP Speculation. It leaves the state of pressure inconsistent with + /// the current position void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI); }; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index bf812e840b876c..651f25c80d60c7 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -190,7 +190,7 @@ static void getRegisterPressures( Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = TempUpwardTracker.getPressure().getArchVGPRNum(); Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = - TempDownwardTracker.getPressure().getAGPRNum(); + TempUpwardTracker.getPressure().getAGPRNum(); } } From 66c42b0669f9d166335d4bb419a5cf253083faaa Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 18 Sep 2024 12:59:36 -0700 Subject: [PATCH 19/27] Avoid const_cast Change-Id: Ib7b21b2ab4cc44abc61fb8ad8880fb78f831619a --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 10 +++++----- 1 
file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index b1658e261cedd3..7e2c396270a624 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -609,15 +609,15 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, LiveIntervals *TheLIS) { assert(MRI && "call reset first"); SlotIndex SI; - LiveIntervals *CurrLIS; - MachineInstr *CurrMI; + const LiveIntervals *CurrLIS; + const MachineInstr *CurrMI; if (UseInternalIterator) { if (!LastTrackedMI) return NextMI == MBBEnd; assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); - CurrLIS = const_cast(&LIS); - CurrMI = const_cast(LastTrackedMI); + CurrLIS = &LIS; + CurrMI = LastTrackedMI; SI = NextMI == MBBEnd ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot() @@ -683,7 +683,7 @@ void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI, LastTrackedMI = MI; } - MachineInstr *CurrMI = const_cast(LastTrackedMI); + const MachineInstr *CurrMI = LastTrackedMI; // Add new registers or mask bits. 
for (const auto &MO : CurrMI->all_defs()) { From dbd68129d8ea360e405bf4b72e9621d8bf5e8512 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 23 Sep 2024 10:08:21 -0700 Subject: [PATCH 20/27] Fix shouldTrackVGPRs calculation Change-Id: I3d0aae74f20927722cd6844b1d586ae7accab86e --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 651f25c80d60c7..28ca41d2dc96ed 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -333,7 +333,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; } else { - GCNRPTracker *T = Zone.isTop() + GCNRPTracker *T = IsBottomUp ? static_cast(&UpwardTracker) : static_cast(&DownwardTracker); SGPRPressure = T->getPressure().getSGPRNum(); From 2714af5756bc6bddafa88406ffcd068890c2eb4d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 27 Sep 2024 12:40:02 -0700 Subject: [PATCH 21/27] Add lit tests Change-Id: I228916bf04add1de7615294d1e58ee4213f0bbde --- .../CodeGen/AMDGPU/high-RP-reschedule.mir | 10 +- llvm/test/CodeGen/AMDGPU/pr51516.mir | 6 +- .../schedule-amdgpu-tracker-physreg-crash.ll | 65 ++ .../AMDGPU/schedule-amdgpu-tracker-physreg.ll | 491 +++++++++++++ .../AMDGPU/schedule-amdgpu-trackers.ll | 647 ++++++++++++++++++ ...schedule-regpressure-ilp-metric-spills.mir | 15 + .../AMDGPU/schedule-relaxed-occupancy.ll | 10 +- 7 files changed, 1240 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir 
b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir index e9005e94ce5db7..d57450baea911a 100644 --- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -1,11 +1,17 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @high-RP-reschedule() { ret void } ... -# CHECK: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 +# GCN: Unclustered High Register Pressure Reschedule stage successfully increased occupancy to 4 + +# GCN-GCNTRACKER: Occupancy before scheduling: 3, after 4. +# GCN-GCNTRACKER-NEXT: Ending scheduling stage: Max Occupancy Initial Schedule + +# When using the GCN Trackers, the scheduler is able to achieve desired occupancy without running high-RP-reschedule stage. 
--- name: high-RP-reschedule diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index 4be102f7860eab..49dd5c6c39ff5c 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,4 +1,5 @@ -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s # Check that %3 was not rematerialized before the last store since its operand %1 # is killed by that store. 
@@ -7,6 +8,9 @@ # GCN: renamable $vgpr33_vgpr34_vgpr35_vgpr36 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) # GCN: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr47, killed renamable $vgpr29_vgpr30_vgpr31_vgpr32, killed renamable $sgpr0_sgpr1, 16, 0, implicit $exec, implicit killed renamable $vgpr46 +# GCN-GCNTRACKER-LABEL: name: global_sextload_v32i32_to_v32i64 +# GCN-GCNTRACKER-NOT: SI_SPILL + --- name: global_sextload_v32i32_to_v32i64 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll new file mode 100644 index 00000000000000..79187f51af0d2b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg-crash.ll @@ -0,0 +1,65 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack -amdgpu-use-amdgpu-trackers=1 2>&1 < %s | FileCheck -check-prefixes=ERR-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack 2>&1 < %s | FileCheck -check-prefixes=GCN %s + +%asm.output = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs + <16 x i32>, <7 x i32>, ; vgprs + i64 ; vcc + } + +%asm.output2 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, i32, ; sgprs + <16 x i32>, <5 x i32>, ; vgprs + i64 ; vcc + } + +%asm.output3 = type { <16 x i32>, <16 x i32>, <16 x i32>, <8 x i32>, <2 x i32>, ; sgprs + <16 x i32>, <6 x i32>, ; vgprs + i64 ; vcc + } + +; ERR-GCNTRACKERS: ran out of registers during register allocation +; GCN-NOT: ran out of registers during register allocation + +; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse + +define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs() #0 { + %alloca0 = alloca [4096 x i32], align 64, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + call void asm sideeffect "; use alloca0 
$0", "v"(ptr addrspace(5) %alloca0) + + %asm = call %asm.output asm sideeffect + "; def $0, $1, $2, $3, $4, $5, $6, $7, $8", + "={s[0:15]},={s[16:31]},={s[32:47]},={s[48:55]},={s[56:57]},={s58},={v[0:15]},={v[16:22]},={vcc}"() + + %s0 = extractvalue %asm.output %asm, 0 + %s1 = extractvalue %asm.output %asm, 1 + %s2 = extractvalue %asm.output %asm, 2 + %s3 = extractvalue %asm.output %asm, 3 + %s4 = extractvalue %asm.output %asm, 4 + %s5 = extractvalue %asm.output %asm, 5 + + %v0 = extractvalue %asm.output %asm, 6 + %v1 = extractvalue %asm.output %asm, 7 + + %vcc = extractvalue %asm.output %asm, 8 + + ; scc is unavailable since it is live in + call void asm sideeffect "; use $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10", + "{s[0:15]},{s[16:31]},{s[32:47]},{s[48:55]},{s[56:57]},{s58},{v[0:15]},{v[16:22]},{vcc},{s59},{scc}"( + <16 x i32> %s0, + <16 x i32> %s1, + <16 x i32> %s2, + <8 x i32> %s3, + <2 x i32> %s4, + i32 %s5, + <16 x i32> %v0, + <7 x i32> %v1, + i64 %vcc, + ptr addrspace(5) %alloca1, + i32 0) ; use of scc + + ret void +} + +attributes #0 = { nounwind alignstack=64 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } +attributes #1 = { nounwind alignstack=16 "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="10,10" "no-realign-stack" } + diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll new file mode 100644 index 00000000000000..c490c76f4531de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll @@ -0,0 +1,491 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-s-branch-bits=5 -amdgpu-long-branch-factor=0 -amdgpu-use-amdgpu-trackers=1 < %s | FileCheck 
--check-prefix=GCN-GCNTRACKERS %s + +; CHECK-LABEL: {{^}}spill: +; GCN: codeLenInByte = 1000 +; GCN-GCNTRACKERS: codeLenInByte = 1016 +; GCN: NumSgprs: 104 +; GCN-GCNTRACKERS: NumSgprs: 104 +; GCN: NumVgprs: 1 +; GCN-GCNTRACKERS: NumVgprs: 2 +; GCN: ScratchSize: 0 +; GCN-GCNTRACKERS: ScratchSize: 0 +; GCN: Occupancy: 5 +; GCN-GCNTRACKERS: Occupancy: 5 + +; FIXME: GCN Trackers do not track pressure from PhysRegs, so scheduling is actually worse + +define amdgpu_kernel void @spill(ptr addrspace(1) %arg, i32 %cnd) #0 { +entry: + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 
+ %sgpr20 = tail call i32 asm sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm sideeffect 
"s_mov_b32 s46, 0", "={s46}"() #0 + %sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = 
tail call i32 asm sideeffect "s_mov_b32 s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", 
"={s99}"() #0 + %sgpr100 = tail call i32 asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 68 bytes + ; 64 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", 
"{s19}"(i32 %sgpr19) #0 + tail call void asm sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect 
"; reg use $0", "{s47}"(i32 %sgpr47) #0 + tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call 
void asm sideeffect "; reg use $0", "{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 
%vcc_lo) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 + ret void +} + +; CHECK-LABEL: {{^}}spill_func: +; GCN: codeLenInByte = 1612 +; GCN-GCNTRACKERS: codeLenInByte = 1660 +; GCN: NumSgprs: 104 +; GCN-GCNTRACKERS: NumSgprs: 104 +; GCN: NumVgprs: 3 +; GCN-GCNTRACKERS: NumVgprs: 4 +; GCN: ScratchSize: 12 +; GCN-GCNTRACKERS: ScratchSize: 16 + +define void @spill_func(ptr addrspace(1) %arg) #0 { +entry: + %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0 + %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={s0}"() #0 + %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={s1}"() #0 + %sgpr2 = tail call i32 asm sideeffect "s_mov_b32 s2, 0", "={s2}"() #0 + %sgpr3 = tail call i32 asm sideeffect "s_mov_b32 s3, 0", "={s3}"() #0 + %sgpr4 = tail call i32 asm sideeffect "s_mov_b32 s4, 0", "={s4}"() #0 + %sgpr5 = tail call i32 asm sideeffect "s_mov_b32 s5, 0", "={s5}"() #0 + %sgpr6 = tail call i32 asm sideeffect "s_mov_b32 s6, 0", "={s6}"() #0 + %sgpr7 = tail call i32 asm sideeffect "s_mov_b32 s7, 0", "={s7}"() #0 + %sgpr8 = tail call i32 asm sideeffect "s_mov_b32 s8, 0", "={s8}"() #0 + %sgpr9 = tail call i32 asm sideeffect "s_mov_b32 s9, 0", "={s9}"() #0 + %sgpr10 = tail call i32 asm sideeffect "s_mov_b32 s10, 0", "={s10}"() #0 + %sgpr11 = tail call i32 asm sideeffect "s_mov_b32 s11, 0", "={s11}"() #0 + %sgpr12 = tail call i32 asm sideeffect "s_mov_b32 s12, 0", "={s12}"() #0 + %sgpr13 = tail call i32 asm sideeffect "s_mov_b32 s13, 0", "={s13}"() #0 + %sgpr14 = tail call i32 asm sideeffect "s_mov_b32 s14, 0", "={s14}"() #0 + %sgpr15 = tail call i32 asm sideeffect "s_mov_b32 s15, 0", "={s15}"() #0 + %sgpr16 = tail call i32 asm sideeffect "s_mov_b32 s16, 0", "={s16}"() #0 + %sgpr17 = tail call i32 asm sideeffect "s_mov_b32 s17, 0", "={s17}"() #0 + %sgpr18 = tail call i32 asm sideeffect "s_mov_b32 s18, 0", "={s18}"() #0 + %sgpr19 = tail call i32 asm sideeffect "s_mov_b32 s19, 0", "={s19}"() #0 + %sgpr20 = tail call i32 asm 
sideeffect "s_mov_b32 s20, 0", "={s20}"() #0 + %sgpr21 = tail call i32 asm sideeffect "s_mov_b32 s21, 0", "={s21}"() #0 + %sgpr22 = tail call i32 asm sideeffect "s_mov_b32 s22, 0", "={s22}"() #0 + %sgpr23 = tail call i32 asm sideeffect "s_mov_b32 s23, 0", "={s23}"() #0 + %sgpr24 = tail call i32 asm sideeffect "s_mov_b32 s24, 0", "={s24}"() #0 + %sgpr25 = tail call i32 asm sideeffect "s_mov_b32 s25, 0", "={s25}"() #0 + %sgpr26 = tail call i32 asm sideeffect "s_mov_b32 s26, 0", "={s26}"() #0 + %sgpr27 = tail call i32 asm sideeffect "s_mov_b32 s27, 0", "={s27}"() #0 + %sgpr28 = tail call i32 asm sideeffect "s_mov_b32 s28, 0", "={s28}"() #0 + %sgpr29 = tail call i32 asm sideeffect "s_mov_b32 s29, 0", "={s29}"() #0 + %sgpr30 = tail call i32 asm sideeffect "s_mov_b32 s30, 0", "={s30}"() #0 + %sgpr31 = tail call i32 asm sideeffect "s_mov_b32 s31, 0", "={s31}"() #0 + %sgpr32 = tail call i32 asm sideeffect "s_mov_b32 s32, 0", "={s32}"() #0 + %sgpr33 = tail call i32 asm sideeffect "s_mov_b32 s33, 0", "={s33}"() #0 + %sgpr34 = tail call i32 asm sideeffect "s_mov_b32 s34, 0", "={s34}"() #0 + %sgpr35 = tail call i32 asm sideeffect "s_mov_b32 s35, 0", "={s35}"() #0 + %sgpr36 = tail call i32 asm sideeffect "s_mov_b32 s36, 0", "={s36}"() #0 + %sgpr37 = tail call i32 asm sideeffect "s_mov_b32 s37, 0", "={s37}"() #0 + %sgpr38 = tail call i32 asm sideeffect "s_mov_b32 s38, 0", "={s38}"() #0 + %sgpr39 = tail call i32 asm sideeffect "s_mov_b32 s39, 0", "={s39}"() #0 + %sgpr40 = tail call i32 asm sideeffect "s_mov_b32 s40, 0", "={s40}"() #0 + %sgpr41 = tail call i32 asm sideeffect "s_mov_b32 s41, 0", "={s41}"() #0 + %sgpr42 = tail call i32 asm sideeffect "s_mov_b32 s42, 0", "={s42}"() #0 + %sgpr43 = tail call i32 asm sideeffect "s_mov_b32 s43, 0", "={s43}"() #0 + %sgpr44 = tail call i32 asm sideeffect "s_mov_b32 s44, 0", "={s44}"() #0 + %sgpr45 = tail call i32 asm sideeffect "s_mov_b32 s45, 0", "={s45}"() #0 + %sgpr46 = tail call i32 asm sideeffect "s_mov_b32 s46, 0", "={s46}"() #0 + 
%sgpr47 = tail call i32 asm sideeffect "s_mov_b32 s47, 0", "={s47}"() #0 + %sgpr48 = tail call i32 asm sideeffect "s_mov_b32 s48, 0", "={s48}"() #0 + %sgpr49 = tail call i32 asm sideeffect "s_mov_b32 s49, 0", "={s49}"() #0 + %sgpr50 = tail call i32 asm sideeffect "s_mov_b32 s50, 0", "={s50}"() #0 + %sgpr51 = tail call i32 asm sideeffect "s_mov_b32 s51, 0", "={s51}"() #0 + %sgpr52 = tail call i32 asm sideeffect "s_mov_b32 s52, 0", "={s52}"() #0 + %sgpr53 = tail call i32 asm sideeffect "s_mov_b32 s53, 0", "={s53}"() #0 + %sgpr54 = tail call i32 asm sideeffect "s_mov_b32 s54, 0", "={s54}"() #0 + %sgpr55 = tail call i32 asm sideeffect "s_mov_b32 s55, 0", "={s55}"() #0 + %sgpr56 = tail call i32 asm sideeffect "s_mov_b32 s56, 0", "={s56}"() #0 + %sgpr57 = tail call i32 asm sideeffect "s_mov_b32 s57, 0", "={s57}"() #0 + %sgpr58 = tail call i32 asm sideeffect "s_mov_b32 s58, 0", "={s58}"() #0 + %sgpr59 = tail call i32 asm sideeffect "s_mov_b32 s59, 0", "={s59}"() #0 + %sgpr60 = tail call i32 asm sideeffect "s_mov_b32 s60, 0", "={s60}"() #0 + %sgpr61 = tail call i32 asm sideeffect "s_mov_b32 s61, 0", "={s61}"() #0 + %sgpr62 = tail call i32 asm sideeffect "s_mov_b32 s62, 0", "={s62}"() #0 + %sgpr63 = tail call i32 asm sideeffect "s_mov_b32 s63, 0", "={s63}"() #0 + %sgpr64 = tail call i32 asm sideeffect "s_mov_b32 s64, 0", "={s64}"() #0 + %sgpr65 = tail call i32 asm sideeffect "s_mov_b32 s65, 0", "={s65}"() #0 + %sgpr66 = tail call i32 asm sideeffect "s_mov_b32 s66, 0", "={s66}"() #0 + %sgpr67 = tail call i32 asm sideeffect "s_mov_b32 s67, 0", "={s67}"() #0 + %sgpr68 = tail call i32 asm sideeffect "s_mov_b32 s68, 0", "={s68}"() #0 + %sgpr69 = tail call i32 asm sideeffect "s_mov_b32 s69, 0", "={s69}"() #0 + %sgpr70 = tail call i32 asm sideeffect "s_mov_b32 s70, 0", "={s70}"() #0 + %sgpr71 = tail call i32 asm sideeffect "s_mov_b32 s71, 0", "={s71}"() #0 + %sgpr72 = tail call i32 asm sideeffect "s_mov_b32 s72, 0", "={s72}"() #0 + %sgpr73 = tail call i32 asm sideeffect "s_mov_b32 
s73, 0", "={s73}"() #0 + %sgpr74 = tail call i32 asm sideeffect "s_mov_b32 s74, 0", "={s74}"() #0 + %sgpr75 = tail call i32 asm sideeffect "s_mov_b32 s75, 0", "={s75}"() #0 + %sgpr76 = tail call i32 asm sideeffect "s_mov_b32 s76, 0", "={s76}"() #0 + %sgpr77 = tail call i32 asm sideeffect "s_mov_b32 s77, 0", "={s77}"() #0 + %sgpr78 = tail call i32 asm sideeffect "s_mov_b32 s78, 0", "={s78}"() #0 + %sgpr79 = tail call i32 asm sideeffect "s_mov_b32 s79, 0", "={s79}"() #0 + %sgpr80 = tail call i32 asm sideeffect "s_mov_b32 s80, 0", "={s80}"() #0 + %sgpr81 = tail call i32 asm sideeffect "s_mov_b32 s81, 0", "={s81}"() #0 + %sgpr82 = tail call i32 asm sideeffect "s_mov_b32 s82, 0", "={s82}"() #0 + %sgpr83 = tail call i32 asm sideeffect "s_mov_b32 s83, 0", "={s83}"() #0 + %sgpr84 = tail call i32 asm sideeffect "s_mov_b32 s84, 0", "={s84}"() #0 + %sgpr85 = tail call i32 asm sideeffect "s_mov_b32 s85, 0", "={s85}"() #0 + %sgpr86 = tail call i32 asm sideeffect "s_mov_b32 s86, 0", "={s86}"() #0 + %sgpr87 = tail call i32 asm sideeffect "s_mov_b32 s87, 0", "={s87}"() #0 + %sgpr88 = tail call i32 asm sideeffect "s_mov_b32 s88, 0", "={s88}"() #0 + %sgpr89 = tail call i32 asm sideeffect "s_mov_b32 s89, 0", "={s89}"() #0 + %sgpr90 = tail call i32 asm sideeffect "s_mov_b32 s90, 0", "={s90}"() #0 + %sgpr91 = tail call i32 asm sideeffect "s_mov_b32 s91, 0", "={s91}"() #0 + %sgpr92 = tail call i32 asm sideeffect "s_mov_b32 s92, 0", "={s92}"() #0 + %sgpr93 = tail call i32 asm sideeffect "s_mov_b32 s93, 0", "={s93}"() #0 + %sgpr94 = tail call i32 asm sideeffect "s_mov_b32 s94, 0", "={s94}"() #0 + %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #0 + %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #0 + %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #0 + %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #0 + %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #0 + %sgpr100 = tail call i32 
asm sideeffect "s_mov_b32 s100, 0", "={s100}"() #0 + %sgpr101 = tail call i32 asm sideeffect "s_mov_b32 s101, 0", "={s101}"() #0 + %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #0 + %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #0 + %cmp = icmp eq i32 %cnd, 0 + br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch + +bb2: ; 68 bytes + ; 64 byte asm + call void asm sideeffect + "v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64",""() #0 + br label %bb3 + +bb3: + tail call void asm sideeffect "; reg use $0", "{s0}"(i32 %sgpr0) #0 + tail call void asm sideeffect "; reg use $0", "{s1}"(i32 %sgpr1) #0 + tail call void asm sideeffect "; reg use $0", "{s2}"(i32 %sgpr2) #0 + tail call void asm sideeffect "; reg use $0", "{s3}"(i32 %sgpr3) #0 + tail call void asm sideeffect "; reg use $0", "{s4}"(i32 %sgpr4) #0 + tail call void asm sideeffect "; reg use $0", "{s5}"(i32 %sgpr5) #0 + tail call void asm sideeffect "; reg use $0", "{s6}"(i32 %sgpr6) #0 + tail call void asm sideeffect "; reg use $0", "{s7}"(i32 %sgpr7) #0 + tail call void asm sideeffect "; reg use $0", "{s8}"(i32 %sgpr8) #0 + tail call void asm sideeffect "; reg use $0", "{s9}"(i32 %sgpr9) #0 + tail call void asm sideeffect "; reg use $0", "{s10}"(i32 %sgpr10) #0 + tail call void asm sideeffect "; reg use $0", "{s11}"(i32 %sgpr11) #0 + tail call void asm sideeffect "; reg use $0", "{s12}"(i32 %sgpr12) #0 + tail call void asm sideeffect "; reg use $0", "{s13}"(i32 %sgpr13) #0 + tail call void asm sideeffect "; reg use $0", "{s14}"(i32 %sgpr14) #0 + tail call void asm sideeffect "; reg use $0", "{s15}"(i32 %sgpr15) #0 + tail call void asm sideeffect "; reg use $0", "{s16}"(i32 %sgpr16) #0 + tail call void asm sideeffect "; reg use $0", "{s17}"(i32 %sgpr17) #0 + tail call void asm sideeffect "; reg use $0", "{s18}"(i32 %sgpr18) #0 + tail call void asm sideeffect "; reg use $0", "{s19}"(i32 %sgpr19) #0 + tail call void asm 
sideeffect "; reg use $0", "{s20}"(i32 %sgpr20) #0 + tail call void asm sideeffect "; reg use $0", "{s21}"(i32 %sgpr21) #0 + tail call void asm sideeffect "; reg use $0", "{s22}"(i32 %sgpr22) #0 + tail call void asm sideeffect "; reg use $0", "{s23}"(i32 %sgpr23) #0 + tail call void asm sideeffect "; reg use $0", "{s24}"(i32 %sgpr24) #0 + tail call void asm sideeffect "; reg use $0", "{s25}"(i32 %sgpr25) #0 + tail call void asm sideeffect "; reg use $0", "{s26}"(i32 %sgpr26) #0 + tail call void asm sideeffect "; reg use $0", "{s27}"(i32 %sgpr27) #0 + tail call void asm sideeffect "; reg use $0", "{s28}"(i32 %sgpr28) #0 + tail call void asm sideeffect "; reg use $0", "{s29}"(i32 %sgpr29) #0 + tail call void asm sideeffect "; reg use $0", "{s30}"(i32 %sgpr30) #0 + tail call void asm sideeffect "; reg use $0", "{s31}"(i32 %sgpr31) #0 + tail call void asm sideeffect "; reg use $0", "{s32}"(i32 %sgpr32) #0 + tail call void asm sideeffect "; reg use $0", "{s33}"(i32 %sgpr33) #0 + tail call void asm sideeffect "; reg use $0", "{s34}"(i32 %sgpr34) #0 + tail call void asm sideeffect "; reg use $0", "{s35}"(i32 %sgpr35) #0 + tail call void asm sideeffect "; reg use $0", "{s36}"(i32 %sgpr36) #0 + tail call void asm sideeffect "; reg use $0", "{s37}"(i32 %sgpr37) #0 + tail call void asm sideeffect "; reg use $0", "{s38}"(i32 %sgpr38) #0 + tail call void asm sideeffect "; reg use $0", "{s39}"(i32 %sgpr39) #0 + tail call void asm sideeffect "; reg use $0", "{s40}"(i32 %sgpr40) #0 + tail call void asm sideeffect "; reg use $0", "{s41}"(i32 %sgpr41) #0 + tail call void asm sideeffect "; reg use $0", "{s42}"(i32 %sgpr42) #0 + tail call void asm sideeffect "; reg use $0", "{s43}"(i32 %sgpr43) #0 + tail call void asm sideeffect "; reg use $0", "{s44}"(i32 %sgpr44) #0 + tail call void asm sideeffect "; reg use $0", "{s45}"(i32 %sgpr45) #0 + tail call void asm sideeffect "; reg use $0", "{s46}"(i32 %sgpr46) #0 + tail call void asm sideeffect "; reg use $0", "{s47}"(i32 %sgpr47) #0 + 
tail call void asm sideeffect "; reg use $0", "{s48}"(i32 %sgpr48) #0 + tail call void asm sideeffect "; reg use $0", "{s49}"(i32 %sgpr49) #0 + tail call void asm sideeffect "; reg use $0", "{s50}"(i32 %sgpr50) #0 + tail call void asm sideeffect "; reg use $0", "{s51}"(i32 %sgpr51) #0 + tail call void asm sideeffect "; reg use $0", "{s52}"(i32 %sgpr52) #0 + tail call void asm sideeffect "; reg use $0", "{s53}"(i32 %sgpr53) #0 + tail call void asm sideeffect "; reg use $0", "{s54}"(i32 %sgpr54) #0 + tail call void asm sideeffect "; reg use $0", "{s55}"(i32 %sgpr55) #0 + tail call void asm sideeffect "; reg use $0", "{s56}"(i32 %sgpr56) #0 + tail call void asm sideeffect "; reg use $0", "{s57}"(i32 %sgpr57) #0 + tail call void asm sideeffect "; reg use $0", "{s58}"(i32 %sgpr58) #0 + tail call void asm sideeffect "; reg use $0", "{s59}"(i32 %sgpr59) #0 + tail call void asm sideeffect "; reg use $0", "{s60}"(i32 %sgpr60) #0 + tail call void asm sideeffect "; reg use $0", "{s61}"(i32 %sgpr61) #0 + tail call void asm sideeffect "; reg use $0", "{s62}"(i32 %sgpr62) #0 + tail call void asm sideeffect "; reg use $0", "{s63}"(i32 %sgpr63) #0 + tail call void asm sideeffect "; reg use $0", "{s64}"(i32 %sgpr64) #0 + tail call void asm sideeffect "; reg use $0", "{s65}"(i32 %sgpr65) #0 + tail call void asm sideeffect "; reg use $0", "{s66}"(i32 %sgpr66) #0 + tail call void asm sideeffect "; reg use $0", "{s67}"(i32 %sgpr67) #0 + tail call void asm sideeffect "; reg use $0", "{s68}"(i32 %sgpr68) #0 + tail call void asm sideeffect "; reg use $0", "{s69}"(i32 %sgpr69) #0 + tail call void asm sideeffect "; reg use $0", "{s70}"(i32 %sgpr70) #0 + tail call void asm sideeffect "; reg use $0", "{s71}"(i32 %sgpr71) #0 + tail call void asm sideeffect "; reg use $0", "{s72}"(i32 %sgpr72) #0 + tail call void asm sideeffect "; reg use $0", "{s73}"(i32 %sgpr73) #0 + tail call void asm sideeffect "; reg use $0", "{s74}"(i32 %sgpr74) #0 + tail call void asm sideeffect "; reg use $0", 
"{s75}"(i32 %sgpr75) #0 + tail call void asm sideeffect "; reg use $0", "{s76}"(i32 %sgpr76) #0 + tail call void asm sideeffect "; reg use $0", "{s77}"(i32 %sgpr77) #0 + tail call void asm sideeffect "; reg use $0", "{s78}"(i32 %sgpr78) #0 + tail call void asm sideeffect "; reg use $0", "{s79}"(i32 %sgpr79) #0 + tail call void asm sideeffect "; reg use $0", "{s80}"(i32 %sgpr80) #0 + tail call void asm sideeffect "; reg use $0", "{s81}"(i32 %sgpr81) #0 + tail call void asm sideeffect "; reg use $0", "{s82}"(i32 %sgpr82) #0 + tail call void asm sideeffect "; reg use $0", "{s83}"(i32 %sgpr83) #0 + tail call void asm sideeffect "; reg use $0", "{s84}"(i32 %sgpr84) #0 + tail call void asm sideeffect "; reg use $0", "{s85}"(i32 %sgpr85) #0 + tail call void asm sideeffect "; reg use $0", "{s86}"(i32 %sgpr86) #0 + tail call void asm sideeffect "; reg use $0", "{s87}"(i32 %sgpr87) #0 + tail call void asm sideeffect "; reg use $0", "{s88}"(i32 %sgpr88) #0 + tail call void asm sideeffect "; reg use $0", "{s89}"(i32 %sgpr89) #0 + tail call void asm sideeffect "; reg use $0", "{s90}"(i32 %sgpr90) #0 + tail call void asm sideeffect "; reg use $0", "{s91}"(i32 %sgpr91) #0 + tail call void asm sideeffect "; reg use $0", "{s92}"(i32 %sgpr92) #0 + tail call void asm sideeffect "; reg use $0", "{s93}"(i32 %sgpr93) #0 + tail call void asm sideeffect "; reg use $0", "{s94}"(i32 %sgpr94) #0 + tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #0 + tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #0 + tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #0 + tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #0 + tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #0 + tail call void asm sideeffect "; reg use $0", "{s100}"(i32 %sgpr100) #0 + tail call void asm sideeffect "; reg use $0", "{s101}"(i32 %sgpr101) #0 + tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #0 + tail call void asm 
sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #0 + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll new file mode 100644 index 00000000000000..53f533ebb28427 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -0,0 +1,647 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX11-PAL-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -verify-misched < %s | FileCheck --check-prefixes=TONGA %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=TONGA-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched < %s | FileCheck --check-prefixes=GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GFX908-GCNTRACKERS %s +; RUN: llc -mtriple=amdgcn -verify-misched < %s | FileCheck --check-prefixes=GENERIC %s +; RUN: llc -mtriple=amdgcn -amdgpu-use-amdgpu-trackers=1 -verify-misched < %s | FileCheck --check-prefixes=GENERIC-GCNTRACKERS %s + +; GCN Trackers are sensitive to minor changes in RP, and will avoid scheduling certain instructions, which, if scheduled, +; allow scheduling of other instructions which reduce RP + +; CHECK-LABEL: {{^}}return_72xi32: +; GFX11-PAL: codeLenInByte = 768 +; GFX11-PAL-GCNTRACKERS: codeLenInByte = 888 +; GFX11-PAL: NumSgprs: 33 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 33 +; GFX11-PAL: NumVgprs: 64 +; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 +; GFX11-PAL: ScratchSize: 220 +; GFX11-PAL-GCNTRACKERS: ScratchSize: 248 + + +; CHECK-LABEL: 
{{^}}call_72xi32: +; GFX11-PAL: codeLenInByte = 1300 +; GFX11-PAL-GCNTRACKERS: codeLenInByte = 1372 +; GFX11-PAL: NumSgprs: 35 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 35 +; GFX11-PAL: NumVgprs: 64 +; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 +; GFX11-PAL: ScratchSize: 2780 +; GFX11-PAL-GCNTRACKERS: ScratchSize: 2808 + + +define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { + ret <72 x i32> %val +} + +define amdgpu_gfx void @call_72xi32() #1 { +entry: + %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer) + %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0 + %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58 + %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1) + ret void +} + +; CHECK-LABEL: {{^}}global_extload_v16f16_to_v16f64: +; TONGA: codeLenInByte = 420 +; TONGA-GCNTRACKERS: codeLenInByte = 436 +; TONGA: NumSgprs: 96 +; TONGA-GCNTRACKERS: NumSgprs: 96 +; TONGA: NumVgprs: 33 +; TONGA-GCNTRACKERS: NumVgprs: 25 +; TONGA: Occupancy: 7 +; TONGA-GCNTRACKERS: Occupancy: 8 + + +define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { + %val = load <16 x half>, ptr addrspace(1) %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, ptr addrspace(1) %out + ret void +} + +; CHECK-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: +; GENERIC: codeLenInByte = 860 +; GENERIC-GCNTRACKERS: codeLenInByte = 860 +; GENERIC: NumSgprs: 71 +; GENERIC-GCNTRACKERS: NumSgprs: 54 +; GENERIC: NumVgprs: 16 +; GENERIC-GCNTRACKERS: NumVgprs: 16 +; GENERIC: Occupancy: 7 +; GENERIC-GCNTRACKERS: Occupancy: 8 + +define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %out, ptr addrspace(4) %in) { + %load = load <64 x i16>, ptr addrspace(4) %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, ptr addrspace(1) %out + ret void +} + +; CHECK-LABEL: {{^}}excess_soft_clause_reg_pressure: +; GFX908: codeLenInByte = 1436 +; 
GFX908-GCNTRACKERS: codeLenInByte = 1436 +; GFX908: NumSgprs: 56 +; GFX908-GCNTRACKERS: NumSgprs: 56 +; GFX908: NumVgprs: 43 +; GFX908-GCNTRACKERS: NumVgprs: 39 +; GFX908: Occupancy: 5 +; GFX908-GCNTRACKERS: Occupancy: 6 + + +define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { +entry: + %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %i2 = load i64, ptr addrspace(4) %i, align 8 + %i3 = tail call i32 @llvm.amdgcn.workgroup.id.x() + %i4 = shl i32 %i3, 8 + %i5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5 + %i6 = add i32 %i4, %i5 + %i7 = trunc i64 %i2 to i32 + %conv = add i32 %i6, %i7 + %conv.frozen = freeze i32 %conv + %div = udiv i32 %conv.frozen, 49 + %add.ptr22 = getelementptr inbounds float, ptr addrspace(4) %wei_ptr, i64 undef + %in.ptr1 = getelementptr inbounds float, ptr addrspace(1) %in, i32 %i5 + br label %for.cond28.preheader + +for.cond28.preheader: ; preds = %for.cond28.preheader, %entry + %accum.sroa.110.0 = phi float [ 0.000000e+00, %entry ], [ %i251, %for.cond28.preheader ] + %accum.sroa.106.0 = phi float [ 0.000000e+00, %entry ], [ %i247, %for.cond28.preheader ] + %accum.sroa.102.0 = phi float [ 0.000000e+00, %entry ], [ %i243, %for.cond28.preheader ] + %accum.sroa.98.0 = phi float [ 0.000000e+00, %entry ], [ %i239, %for.cond28.preheader ] + %accum.sroa.94.0 = phi float [ 0.000000e+00, %entry ], [ %i235, %for.cond28.preheader ] + %accum.sroa.90.0 = phi float [ 0.000000e+00, %entry ], [ %i231, %for.cond28.preheader ] + %accum.sroa.86.0 = phi float [ 0.000000e+00, %entry ], [ %i227, %for.cond28.preheader ] + %accum.sroa.82.0 = phi float [ 0.000000e+00, %entry ], [ %i223, %for.cond28.preheader ] + %accum.sroa.78.0 = phi float [ 0.000000e+00, %entry ], [ %i219, %for.cond28.preheader ] + %accum.sroa.74.0 = phi float [ 0.000000e+00, %entry ], [ %i215, %for.cond28.preheader ] + %accum.sroa.70.0 = phi float [ 0.000000e+00, %entry ], [ 
%i211, %for.cond28.preheader ] + %accum.sroa.66.0 = phi float [ 0.000000e+00, %entry ], [ %i207, %for.cond28.preheader ] + %accum.sroa.62.0 = phi float [ 0.000000e+00, %entry ], [ %i203, %for.cond28.preheader ] + %accum.sroa.58.0 = phi float [ 0.000000e+00, %entry ], [ %i199, %for.cond28.preheader ] + %accum.sroa.54.0 = phi float [ 0.000000e+00, %entry ], [ %i195, %for.cond28.preheader ] + %accum.sroa.50.0 = phi float [ 0.000000e+00, %entry ], [ %i191, %for.cond28.preheader ] + %accum.sroa.46.0 = phi float [ 0.000000e+00, %entry ], [ %i187, %for.cond28.preheader ] + %accum.sroa.42.0 = phi float [ 0.000000e+00, %entry ], [ %i183, %for.cond28.preheader ] + %accum.sroa.38.0 = phi float [ 0.000000e+00, %entry ], [ %i179, %for.cond28.preheader ] + %accum.sroa.34.0 = phi float [ 0.000000e+00, %entry ], [ %i175, %for.cond28.preheader ] + %accum.sroa.30.0 = phi float [ 0.000000e+00, %entry ], [ %i171, %for.cond28.preheader ] + %accum.sroa.26.0 = phi float [ 0.000000e+00, %entry ], [ %i167, %for.cond28.preheader ] + %accum.sroa.22.0 = phi float [ 0.000000e+00, %entry ], [ %i163, %for.cond28.preheader ] + %accum.sroa.18.0 = phi float [ 0.000000e+00, %entry ], [ %i159, %for.cond28.preheader ] + %accum.sroa.14.0 = phi float [ 0.000000e+00, %entry ], [ %i155, %for.cond28.preheader ] + %accum.sroa.10.0 = phi float [ 0.000000e+00, %entry ], [ %i151, %for.cond28.preheader ] + %accum.sroa.6.0 = phi float [ 0.000000e+00, %entry ], [ %i147, %for.cond28.preheader ] + %accum.sroa.0.0 = phi float [ 0.000000e+00, %entry ], [ %i143, %for.cond28.preheader ] + %accum.sroa.114.0 = phi float [ 0.000000e+00, %entry ], [ %i255, %for.cond28.preheader ] + %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ] + %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ] + %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ] + %i_ptr.0288 = phi ptr addrspace(1) [ %in.ptr1, %entry ], [ %add.ptr47.3, 
%for.cond28.preheader ] + %w_ptr.0287 = phi ptr addrspace(4) [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ] + %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ] + %i8 = load float, ptr addrspace(1) %i_ptr.0288, align 4 + %add.ptr47 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 49 + %i9 = load float, ptr addrspace(1) %add.ptr47, align 4 + %add.ptr47.1 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 98 + %i10 = load float, ptr addrspace(1) %add.ptr47.1, align 4 + %add.ptr47.2 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 147 + %i11 = load float, ptr addrspace(1) %add.ptr47.2, align 4 + %i12 = load float, ptr addrspace(4) %w_ptr.0287, align 4 + %add.ptr66 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1024 + %i13 = load float, ptr addrspace(4) %add.ptr66, align 4 + %add.ptr66.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2048 + %i14 = load float, ptr addrspace(4) %add.ptr66.1, align 4 + %add.ptr66.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3072 + %i15 = load float, ptr addrspace(4) %add.ptr66.2, align 4 + %add.ptr70 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1 + %i16 = load float, ptr addrspace(4) %add.ptr70, align 4 + %add.ptr66.1291 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1025 + %i17 = load float, ptr addrspace(4) %add.ptr66.1291, align 4 + %add.ptr66.1.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2049 + %i18 = load float, ptr addrspace(4) %add.ptr66.1.1, align 4 + %add.ptr66.2.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3073 + %i19 = load float, ptr addrspace(4) %add.ptr66.2.1, align 4 + %add.ptr70.1 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2 + %i20 = load float, ptr addrspace(4) %add.ptr70.1, align 4 + %add.ptr66.2293 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1026 + 
%i21 = load float, ptr addrspace(4) %add.ptr66.2293, align 4 + %add.ptr66.1.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2050 + %i22 = load float, ptr addrspace(4) %add.ptr66.1.2, align 4 + %add.ptr66.2.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3074 + %i23 = load float, ptr addrspace(4) %add.ptr66.2.2, align 4 + %add.ptr70.2 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3 + %i24 = load float, ptr addrspace(4) %add.ptr70.2, align 4 + %add.ptr66.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1027 + %i25 = load float, ptr addrspace(4) %add.ptr66.3, align 4 + %add.ptr66.1.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2051 + %i26 = load float, ptr addrspace(4) %add.ptr66.1.3, align 4 + %add.ptr66.2.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3075 + %i27 = load float, ptr addrspace(4) %add.ptr66.2.3, align 4 + %add.ptr70.3 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4 + %i28 = load float, ptr addrspace(4) %add.ptr70.3, align 4 + %add.ptr66.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1028 + %i29 = load float, ptr addrspace(4) %add.ptr66.4, align 4 + %add.ptr66.1.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2052 + %i30 = load float, ptr addrspace(4) %add.ptr66.1.4, align 4 + %add.ptr66.2.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3076 + %i31 = load float, ptr addrspace(4) %add.ptr66.2.4, align 4 + %add.ptr70.4 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 5 + %i32 = load float, ptr addrspace(4) %add.ptr70.4, align 4 + %add.ptr66.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1029 + %i33 = load float, ptr addrspace(4) %add.ptr66.5, align 4 + %add.ptr66.1.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2053 + %i34 = load float, ptr addrspace(4) %add.ptr66.1.5, align 4 + %add.ptr66.2.5 = 
getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3077 + %i35 = load float, ptr addrspace(4) %add.ptr66.2.5, align 4 + %add.ptr70.5 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 6 + %i36 = load float, ptr addrspace(4) %add.ptr70.5, align 4 + %add.ptr66.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1030 + %i37 = load float, ptr addrspace(4) %add.ptr66.6, align 4 + %add.ptr66.1.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2054 + %i38 = load float, ptr addrspace(4) %add.ptr66.1.6, align 4 + %add.ptr66.2.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3078 + %i39 = load float, ptr addrspace(4) %add.ptr66.2.6, align 4 + %add.ptr70.6 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 7 + %i40 = load float, ptr addrspace(4) %add.ptr70.6, align 4 + %add.ptr66.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1031 + %i41 = load float, ptr addrspace(4) %add.ptr66.7, align 4 + %add.ptr66.1.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2055 + %i42 = load float, ptr addrspace(4) %add.ptr66.1.7, align 4 + %add.ptr66.2.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3079 + %i43 = load float, ptr addrspace(4) %add.ptr66.2.7, align 4 + %add.ptr70.7 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 8 + %i44 = load float, ptr addrspace(4) %add.ptr70.7, align 4 + %add.ptr66.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1032 + %i45 = load float, ptr addrspace(4) %add.ptr66.8, align 4 + %add.ptr66.1.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2056 + %i46 = load float, ptr addrspace(4) %add.ptr66.1.8, align 4 + %add.ptr66.2.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3080 + %i47 = load float, ptr addrspace(4) %add.ptr66.2.8, align 4 + %add.ptr70.8 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 9 + %i48 = load 
float, ptr addrspace(4) %add.ptr70.8, align 4 + %add.ptr66.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1033 + %i49 = load float, ptr addrspace(4) %add.ptr66.9, align 4 + %add.ptr66.1.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2057 + %i50 = load float, ptr addrspace(4) %add.ptr66.1.9, align 4 + %add.ptr66.2.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3081 + %i51 = load float, ptr addrspace(4) %add.ptr66.2.9, align 4 + %add.ptr70.9 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 10 + %i52 = load float, ptr addrspace(4) %add.ptr70.9, align 4 + %add.ptr66.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1034 + %i53 = load float, ptr addrspace(4) %add.ptr66.10, align 4 + %add.ptr66.1.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2058 + %i54 = load float, ptr addrspace(4) %add.ptr66.1.10, align 4 + %add.ptr66.2.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3082 + %i55 = load float, ptr addrspace(4) %add.ptr66.2.10, align 4 + %add.ptr70.10 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 11 + %i56 = load float, ptr addrspace(4) %add.ptr70.10, align 4 + %add.ptr66.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1035 + %i57 = load float, ptr addrspace(4) %add.ptr66.11, align 4 + %add.ptr66.1.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2059 + %i58 = load float, ptr addrspace(4) %add.ptr66.1.11, align 4 + %add.ptr66.2.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3083 + %i59 = load float, ptr addrspace(4) %add.ptr66.2.11, align 4 + %add.ptr70.11 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 12 + %i60 = load float, ptr addrspace(4) %add.ptr70.11, align 4 + %add.ptr66.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1036 + %i61 = load float, ptr addrspace(4) %add.ptr66.12, align 4 + %add.ptr66.1.12 = 
getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2060 + %i62 = load float, ptr addrspace(4) %add.ptr66.1.12, align 4 + %add.ptr66.2.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3084 + %i63 = load float, ptr addrspace(4) %add.ptr66.2.12, align 4 + %add.ptr70.12 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 13 + %i64 = load float, ptr addrspace(4) %add.ptr70.12, align 4 + %add.ptr66.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1037 + %i65 = load float, ptr addrspace(4) %add.ptr66.13, align 4 + %add.ptr66.1.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2061 + %i66 = load float, ptr addrspace(4) %add.ptr66.1.13, align 4 + %add.ptr66.2.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3085 + %i67 = load float, ptr addrspace(4) %add.ptr66.2.13, align 4 + %add.ptr70.13 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 14 + %i68 = load float, ptr addrspace(4) %add.ptr70.13, align 4 + %add.ptr66.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1038 + %i69 = load float, ptr addrspace(4) %add.ptr66.14, align 4 + %add.ptr66.1.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2062 + %i70 = load float, ptr addrspace(4) %add.ptr66.1.14, align 4 + %add.ptr66.2.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3086 + %i71 = load float, ptr addrspace(4) %add.ptr66.2.14, align 4 + %add.ptr70.14 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 15 + %i72 = load float, ptr addrspace(4) %add.ptr70.14, align 4 + %add.ptr66.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1039 + %i73 = load float, ptr addrspace(4) %add.ptr66.15, align 4 + %add.ptr66.1.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2063 + %i74 = load float, ptr addrspace(4) %add.ptr66.1.15, align 4 + %add.ptr66.2.15 = getelementptr inbounds float, ptr addrspace(4) 
%w_ptr.0287, i64 3087 + %i75 = load float, ptr addrspace(4) %add.ptr66.2.15, align 4 + %add.ptr70.15 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 16 + %i76 = load float, ptr addrspace(4) %add.ptr70.15, align 4 + %add.ptr66.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1040 + %i77 = load float, ptr addrspace(4) %add.ptr66.16, align 4 + %add.ptr66.1.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2064 + %i78 = load float, ptr addrspace(4) %add.ptr66.1.16, align 4 + %add.ptr66.2.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3088 + %i79 = load float, ptr addrspace(4) %add.ptr66.2.16, align 4 + %add.ptr70.16 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 17 + %i80 = load float, ptr addrspace(4) %add.ptr70.16, align 4 + %add.ptr66.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1041 + %i81 = load float, ptr addrspace(4) %add.ptr66.17, align 4 + %add.ptr66.1.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2065 + %i82 = load float, ptr addrspace(4) %add.ptr66.1.17, align 4 + %add.ptr66.2.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3089 + %i83 = load float, ptr addrspace(4) %add.ptr66.2.17, align 4 + %add.ptr70.17 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 18 + %i84 = load float, ptr addrspace(4) %add.ptr70.17, align 4 + %add.ptr66.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1042 + %i85 = load float, ptr addrspace(4) %add.ptr66.18, align 4 + %add.ptr66.1.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2066 + %i86 = load float, ptr addrspace(4) %add.ptr66.1.18, align 4 + %add.ptr66.2.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3090 + %i87 = load float, ptr addrspace(4) %add.ptr66.2.18, align 4 + %add.ptr70.18 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 19 + %i88 = load float, ptr 
addrspace(4) %add.ptr70.18, align 4 + %add.ptr66.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1043 + %i89 = load float, ptr addrspace(4) %add.ptr66.19, align 4 + %add.ptr66.1.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2067 + %i90 = load float, ptr addrspace(4) %add.ptr66.1.19, align 4 + %add.ptr66.2.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3091 + %i91 = load float, ptr addrspace(4) %add.ptr66.2.19, align 4 + %add.ptr70.19 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 20 + %i92 = load float, ptr addrspace(4) %add.ptr70.19, align 4 + %add.ptr66.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1044 + %i93 = load float, ptr addrspace(4) %add.ptr66.20, align 4 + %add.ptr66.1.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2068 + %i94 = load float, ptr addrspace(4) %add.ptr66.1.20, align 4 + %add.ptr66.2.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3092 + %i95 = load float, ptr addrspace(4) %add.ptr66.2.20, align 4 + %add.ptr70.20 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 21 + %i96 = load float, ptr addrspace(4) %add.ptr70.20, align 4 + %add.ptr66.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1045 + %i97 = load float, ptr addrspace(4) %add.ptr66.21, align 4 + %add.ptr66.1.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2069 + %i98 = load float, ptr addrspace(4) %add.ptr66.1.21, align 4 + %add.ptr66.2.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3093 + %i99 = load float, ptr addrspace(4) %add.ptr66.2.21, align 4 + %add.ptr70.21 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 22 + %i100 = load float, ptr addrspace(4) %add.ptr70.21, align 4 + %add.ptr66.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1046 + %i101 = load float, ptr addrspace(4) %add.ptr66.22, align 4 + %add.ptr66.1.22 = 
getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2070 + %i102 = load float, ptr addrspace(4) %add.ptr66.1.22, align 4 + %add.ptr66.2.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3094 + %i103 = load float, ptr addrspace(4) %add.ptr66.2.22, align 4 + %add.ptr70.22 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 23 + %i104 = load float, ptr addrspace(4) %add.ptr70.22, align 4 + %add.ptr66.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1047 + %i105 = load float, ptr addrspace(4) %add.ptr66.23, align 4 + %add.ptr66.1.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2071 + %i106 = load float, ptr addrspace(4) %add.ptr66.1.23, align 4 + %add.ptr66.2.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3095 + %i107 = load float, ptr addrspace(4) %add.ptr66.2.23, align 4 + %add.ptr70.23 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 24 + %i108 = load float, ptr addrspace(4) %add.ptr70.23, align 4 + %add.ptr66.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1048 + %i109 = load float, ptr addrspace(4) %add.ptr66.24, align 4 + %add.ptr66.1.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2072 + %i110 = load float, ptr addrspace(4) %add.ptr66.1.24, align 4 + %add.ptr66.2.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3096 + %i111 = load float, ptr addrspace(4) %add.ptr66.2.24, align 4 + %add.ptr70.24 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 25 + %i112 = load float, ptr addrspace(4) %add.ptr70.24, align 4 + %add.ptr66.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1049 + %i113 = load float, ptr addrspace(4) %add.ptr66.25, align 4 + %add.ptr66.1.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2073 + %i114 = load float, ptr addrspace(4) %add.ptr66.1.25, align 4 + %add.ptr66.2.25 = getelementptr inbounds float, ptr 
addrspace(4) %w_ptr.0287, i64 3097 + %i115 = load float, ptr addrspace(4) %add.ptr66.2.25, align 4 + %add.ptr70.25 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 26 + %i116 = load float, ptr addrspace(4) %add.ptr70.25, align 4 + %add.ptr66.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1050 + %i117 = load float, ptr addrspace(4) %add.ptr66.26, align 4 + %add.ptr66.1.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2074 + %i118 = load float, ptr addrspace(4) %add.ptr66.1.26, align 4 + %add.ptr66.2.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3098 + %i119 = load float, ptr addrspace(4) %add.ptr66.2.26, align 4 + %add.ptr70.26 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 27 + %i120 = load float, ptr addrspace(4) %add.ptr70.26, align 4 + %add.ptr66.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1051 + %i121 = load float, ptr addrspace(4) %add.ptr66.27, align 4 + %add.ptr66.1.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2075 + %i122 = load float, ptr addrspace(4) %add.ptr66.1.27, align 4 + %add.ptr66.2.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3099 + %i123 = load float, ptr addrspace(4) %add.ptr66.2.27, align 4 + %add.ptr70.27 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 28 + %i124 = load float, ptr addrspace(4) %add.ptr70.27, align 4 + %add.ptr66.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1052 + %i125 = load float, ptr addrspace(4) %add.ptr66.28, align 4 + %add.ptr66.1.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2076 + %i126 = load float, ptr addrspace(4) %add.ptr66.1.28, align 4 + %add.ptr66.2.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3100 + %i127 = load float, ptr addrspace(4) %add.ptr66.2.28, align 4 + %add.ptr70.28 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 29 + %i128 = 
load float, ptr addrspace(4) %add.ptr70.28, align 4 + %add.ptr66.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1053 + %i129 = load float, ptr addrspace(4) %add.ptr66.29, align 4 + %add.ptr66.1.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2077 + %i130 = load float, ptr addrspace(4) %add.ptr66.1.29, align 4 + %add.ptr66.2.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3101 + %i131 = load float, ptr addrspace(4) %add.ptr66.2.29, align 4 + %add.ptr70.29 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 30 + %i132 = load float, ptr addrspace(4) %add.ptr70.29, align 4 + %add.ptr66.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1054 + %i133 = load float, ptr addrspace(4) %add.ptr66.30, align 4 + %add.ptr66.1.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2078 + %i134 = load float, ptr addrspace(4) %add.ptr66.1.30, align 4 + %add.ptr66.2.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3102 + %i135 = load float, ptr addrspace(4) %add.ptr66.2.30, align 4 + %add.ptr70.30 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 31 + %i136 = load float, ptr addrspace(4) %add.ptr70.30, align 4 + %add.ptr66.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 1055 + %i137 = load float, ptr addrspace(4) %add.ptr66.31, align 4 + %add.ptr66.1.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 2079 + %i138 = load float, ptr addrspace(4) %add.ptr66.1.31, align 4 + %add.ptr66.2.31 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 3103 + %i139 = load float, ptr addrspace(4) %add.ptr66.2.31, align 4 + %add.ptr47.3 = getelementptr inbounds float, ptr addrspace(1) %i_ptr.0288, i64 196 + %i140 = tail call float @llvm.fmuladd.f32(float %i8, float %i12, float %accum.sroa.0.0) + %i141 = tail call float @llvm.fmuladd.f32(float %i9, float %i13, float %i140) + %i142 = tail call float 
@llvm.fmuladd.f32(float %i10, float %i14, float %i141) + %i143 = tail call float @llvm.fmuladd.f32(float %i11, float %i15, float %i142) + %i144 = tail call float @llvm.fmuladd.f32(float %i8, float %i16, float %accum.sroa.6.0) + %i145 = tail call float @llvm.fmuladd.f32(float %i9, float %i17, float %i144) + %i146 = tail call float @llvm.fmuladd.f32(float %i10, float %i18, float %i145) + %i147 = tail call float @llvm.fmuladd.f32(float %i11, float %i19, float %i146) + %i148 = tail call float @llvm.fmuladd.f32(float %i8, float %i20, float %accum.sroa.10.0) + %i149 = tail call float @llvm.fmuladd.f32(float %i9, float %i21, float %i148) + %i150 = tail call float @llvm.fmuladd.f32(float %i10, float %i22, float %i149) + %i151 = tail call float @llvm.fmuladd.f32(float %i11, float %i23, float %i150) + %i152 = tail call float @llvm.fmuladd.f32(float %i8, float %i24, float %accum.sroa.14.0) + %i153 = tail call float @llvm.fmuladd.f32(float %i9, float %i25, float %i152) + %i154 = tail call float @llvm.fmuladd.f32(float %i10, float %i26, float %i153) + %i155 = tail call float @llvm.fmuladd.f32(float %i11, float %i27, float %i154) + %i156 = tail call float @llvm.fmuladd.f32(float %i8, float %i28, float %accum.sroa.18.0) + %i157 = tail call float @llvm.fmuladd.f32(float %i9, float %i29, float %i156) + %i158 = tail call float @llvm.fmuladd.f32(float %i10, float %i30, float %i157) + %i159 = tail call float @llvm.fmuladd.f32(float %i11, float %i31, float %i158) + %i160 = tail call float @llvm.fmuladd.f32(float %i8, float %i32, float %accum.sroa.22.0) + %i161 = tail call float @llvm.fmuladd.f32(float %i9, float %i33, float %i160) + %i162 = tail call float @llvm.fmuladd.f32(float %i10, float %i34, float %i161) + %i163 = tail call float @llvm.fmuladd.f32(float %i11, float %i35, float %i162) + %i164 = tail call float @llvm.fmuladd.f32(float %i8, float %i36, float %accum.sroa.26.0) + %i165 = tail call float @llvm.fmuladd.f32(float %i9, float %i37, float %i164) + %i166 = tail call float 
@llvm.fmuladd.f32(float %i10, float %i38, float %i165) + %i167 = tail call float @llvm.fmuladd.f32(float %i11, float %i39, float %i166) + %i168 = tail call float @llvm.fmuladd.f32(float %i8, float %i40, float %accum.sroa.30.0) + %i169 = tail call float @llvm.fmuladd.f32(float %i9, float %i41, float %i168) + %i170 = tail call float @llvm.fmuladd.f32(float %i10, float %i42, float %i169) + %i171 = tail call float @llvm.fmuladd.f32(float %i11, float %i43, float %i170) + %i172 = tail call float @llvm.fmuladd.f32(float %i8, float %i44, float %accum.sroa.34.0) + %i173 = tail call float @llvm.fmuladd.f32(float %i9, float %i45, float %i172) + %i174 = tail call float @llvm.fmuladd.f32(float %i10, float %i46, float %i173) + %i175 = tail call float @llvm.fmuladd.f32(float %i11, float %i47, float %i174) + %i176 = tail call float @llvm.fmuladd.f32(float %i8, float %i48, float %accum.sroa.38.0) + %i177 = tail call float @llvm.fmuladd.f32(float %i9, float %i49, float %i176) + %i178 = tail call float @llvm.fmuladd.f32(float %i10, float %i50, float %i177) + %i179 = tail call float @llvm.fmuladd.f32(float %i11, float %i51, float %i178) + %i180 = tail call float @llvm.fmuladd.f32(float %i8, float %i52, float %accum.sroa.42.0) + %i181 = tail call float @llvm.fmuladd.f32(float %i9, float %i53, float %i180) + %i182 = tail call float @llvm.fmuladd.f32(float %i10, float %i54, float %i181) + %i183 = tail call float @llvm.fmuladd.f32(float %i11, float %i55, float %i182) + %i184 = tail call float @llvm.fmuladd.f32(float %i8, float %i56, float %accum.sroa.46.0) + %i185 = tail call float @llvm.fmuladd.f32(float %i9, float %i57, float %i184) + %i186 = tail call float @llvm.fmuladd.f32(float %i10, float %i58, float %i185) + %i187 = tail call float @llvm.fmuladd.f32(float %i11, float %i59, float %i186) + %i188 = tail call float @llvm.fmuladd.f32(float %i8, float %i60, float %accum.sroa.50.0) + %i189 = tail call float @llvm.fmuladd.f32(float %i9, float %i61, float %i188) + %i190 = tail call float 
@llvm.fmuladd.f32(float %i10, float %i62, float %i189) + %i191 = tail call float @llvm.fmuladd.f32(float %i11, float %i63, float %i190) + %i192 = tail call float @llvm.fmuladd.f32(float %i8, float %i64, float %accum.sroa.54.0) + %i193 = tail call float @llvm.fmuladd.f32(float %i9, float %i65, float %i192) + %i194 = tail call float @llvm.fmuladd.f32(float %i10, float %i66, float %i193) + %i195 = tail call float @llvm.fmuladd.f32(float %i11, float %i67, float %i194) + %i196 = tail call float @llvm.fmuladd.f32(float %i8, float %i68, float %accum.sroa.58.0) + %i197 = tail call float @llvm.fmuladd.f32(float %i9, float %i69, float %i196) + %i198 = tail call float @llvm.fmuladd.f32(float %i10, float %i70, float %i197) + %i199 = tail call float @llvm.fmuladd.f32(float %i11, float %i71, float %i198) + %i200 = tail call float @llvm.fmuladd.f32(float %i8, float %i72, float %accum.sroa.62.0) + %i201 = tail call float @llvm.fmuladd.f32(float %i9, float %i73, float %i200) + %i202 = tail call float @llvm.fmuladd.f32(float %i10, float %i74, float %i201) + %i203 = tail call float @llvm.fmuladd.f32(float %i11, float %i75, float %i202) + %i204 = tail call float @llvm.fmuladd.f32(float %i8, float %i76, float %accum.sroa.66.0) + %i205 = tail call float @llvm.fmuladd.f32(float %i9, float %i77, float %i204) + %i206 = tail call float @llvm.fmuladd.f32(float %i10, float %i78, float %i205) + %i207 = tail call float @llvm.fmuladd.f32(float %i11, float %i79, float %i206) + %i208 = tail call float @llvm.fmuladd.f32(float %i8, float %i80, float %accum.sroa.70.0) + %i209 = tail call float @llvm.fmuladd.f32(float %i9, float %i81, float %i208) + %i210 = tail call float @llvm.fmuladd.f32(float %i10, float %i82, float %i209) + %i211 = tail call float @llvm.fmuladd.f32(float %i11, float %i83, float %i210) + %i212 = tail call float @llvm.fmuladd.f32(float %i8, float %i84, float %accum.sroa.74.0) + %i213 = tail call float @llvm.fmuladd.f32(float %i9, float %i85, float %i212) + %i214 = tail call float 
@llvm.fmuladd.f32(float %i10, float %i86, float %i213) + %i215 = tail call float @llvm.fmuladd.f32(float %i11, float %i87, float %i214) + %i216 = tail call float @llvm.fmuladd.f32(float %i8, float %i88, float %accum.sroa.78.0) + %i217 = tail call float @llvm.fmuladd.f32(float %i9, float %i89, float %i216) + %i218 = tail call float @llvm.fmuladd.f32(float %i10, float %i90, float %i217) + %i219 = tail call float @llvm.fmuladd.f32(float %i11, float %i91, float %i218) + %i220 = tail call float @llvm.fmuladd.f32(float %i8, float %i92, float %accum.sroa.82.0) + %i221 = tail call float @llvm.fmuladd.f32(float %i9, float %i93, float %i220) + %i222 = tail call float @llvm.fmuladd.f32(float %i10, float %i94, float %i221) + %i223 = tail call float @llvm.fmuladd.f32(float %i11, float %i95, float %i222) + %i224 = tail call float @llvm.fmuladd.f32(float %i8, float %i96, float %accum.sroa.86.0) + %i225 = tail call float @llvm.fmuladd.f32(float %i9, float %i97, float %i224) + %i226 = tail call float @llvm.fmuladd.f32(float %i10, float %i98, float %i225) + %i227 = tail call float @llvm.fmuladd.f32(float %i11, float %i99, float %i226) + %i228 = tail call float @llvm.fmuladd.f32(float %i8, float %i100, float %accum.sroa.90.0) + %i229 = tail call float @llvm.fmuladd.f32(float %i9, float %i101, float %i228) + %i230 = tail call float @llvm.fmuladd.f32(float %i10, float %i102, float %i229) + %i231 = tail call float @llvm.fmuladd.f32(float %i11, float %i103, float %i230) + %i232 = tail call float @llvm.fmuladd.f32(float %i8, float %i104, float %accum.sroa.94.0) + %i233 = tail call float @llvm.fmuladd.f32(float %i9, float %i105, float %i232) + %i234 = tail call float @llvm.fmuladd.f32(float %i10, float %i106, float %i233) + %i235 = tail call float @llvm.fmuladd.f32(float %i11, float %i107, float %i234) + %i236 = tail call float @llvm.fmuladd.f32(float %i8, float %i108, float %accum.sroa.98.0) + %i237 = tail call float @llvm.fmuladd.f32(float %i9, float %i109, float %i236) + %i238 = tail 
call float @llvm.fmuladd.f32(float %i10, float %i110, float %i237) + %i239 = tail call float @llvm.fmuladd.f32(float %i11, float %i111, float %i238) + %i240 = tail call float @llvm.fmuladd.f32(float %i8, float %i112, float %accum.sroa.102.0) + %i241 = tail call float @llvm.fmuladd.f32(float %i9, float %i113, float %i240) + %i242 = tail call float @llvm.fmuladd.f32(float %i10, float %i114, float %i241) + %i243 = tail call float @llvm.fmuladd.f32(float %i11, float %i115, float %i242) + %i244 = tail call float @llvm.fmuladd.f32(float %i8, float %i116, float %accum.sroa.106.0) + %i245 = tail call float @llvm.fmuladd.f32(float %i9, float %i117, float %i244) + %i246 = tail call float @llvm.fmuladd.f32(float %i10, float %i118, float %i245) + %i247 = tail call float @llvm.fmuladd.f32(float %i11, float %i119, float %i246) + %i248 = tail call float @llvm.fmuladd.f32(float %i8, float %i120, float %accum.sroa.110.0) + %i249 = tail call float @llvm.fmuladd.f32(float %i9, float %i121, float %i248) + %i250 = tail call float @llvm.fmuladd.f32(float %i10, float %i122, float %i249) + %i251 = tail call float @llvm.fmuladd.f32(float %i11, float %i123, float %i250) + %i252 = tail call float @llvm.fmuladd.f32(float %i8, float %i124, float %accum.sroa.114.0) + %i253 = tail call float @llvm.fmuladd.f32(float %i9, float %i125, float %i252) + %i254 = tail call float @llvm.fmuladd.f32(float %i10, float %i126, float %i253) + %i255 = tail call float @llvm.fmuladd.f32(float %i11, float %i127, float %i254) + %i256 = tail call float @llvm.fmuladd.f32(float %i8, float %i128, float %accum.sroa.118.0) + %i257 = tail call float @llvm.fmuladd.f32(float %i9, float %i129, float %i256) + %i258 = tail call float @llvm.fmuladd.f32(float %i10, float %i130, float %i257) + %i259 = tail call float @llvm.fmuladd.f32(float %i11, float %i131, float %i258) + %i260 = tail call float @llvm.fmuladd.f32(float %i8, float %i132, float %accum.sroa.122.0) + %i261 = tail call float @llvm.fmuladd.f32(float %i9, float %i133, 
float %i260) + %i262 = tail call float @llvm.fmuladd.f32(float %i10, float %i134, float %i261) + %i263 = tail call float @llvm.fmuladd.f32(float %i11, float %i135, float %i262) + %i264 = tail call float @llvm.fmuladd.f32(float %i8, float %i136, float %accum.sroa.126.0) + %i265 = tail call float @llvm.fmuladd.f32(float %i9, float %i137, float %i264) + %i266 = tail call float @llvm.fmuladd.f32(float %i10, float %i138, float %i265) + %i267 = tail call float @llvm.fmuladd.f32(float %i11, float %i139, float %i266) + %add.ptr74 = getelementptr inbounds float, ptr addrspace(4) %w_ptr.0287, i64 4096 + %inc116 = add nuw nsw i32 %ci.0286, 1 + %exitcond.not = icmp eq i32 %inc116, 512 + br i1 %exitcond.not, label %for.cond.cleanup26, label %for.cond28.preheader + +for.cond.cleanup26: ; preds = %for.cond28.preheader + %mul119 = shl nuw nsw i32 undef, 1 + %mul120 = mul i32 %div, 200704 + %mul121 = mul i32 undef, 6272 + %add122 = add i32 %mul120, %mul121 + %mul123 = mul nuw nsw i32 undef, 28 + %add124 = add i32 %add122, %mul123 + %add126 = add i32 %add124, %mul119 + %idx.ext127 = zext i32 %add126 to i64 + %add.ptr128 = getelementptr inbounds float, ptr addrspace(1) %out_ptr, i64 %idx.ext127 + store float %i143, ptr addrspace(1) %add.ptr128, align 4 + %add.ptr184 = getelementptr inbounds float, ptr addrspace(1) %add.ptr128, i64 196 + store float %i147, ptr addrspace(1) %add.ptr184, align 4 + %add.ptr167.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 14 + store float 0.000000e+00, ptr addrspace(1) %add.ptr167.1, align 4 + %add.ptr175.1.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.1, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.1, align 4 + %add.ptr184.1 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184, i64 196 + store float %i151, ptr addrspace(1) %add.ptr184.1, align 4 + %add.ptr184.2 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.1, i64 196 + store float %i155, ptr addrspace(1) %add.ptr184.2, 
align 4 + %add.ptr184.3 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.2, i64 196 + store float %i159, ptr addrspace(1) %add.ptr184.3, align 4 + %add.ptr184.4 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.3, i64 196 + store float %i163, ptr addrspace(1) %add.ptr184.4, align 4 + %add.ptr154.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr154.5, align 4 + %add.ptr184.5 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.4, i64 196 + store float %i167, ptr addrspace(1) %add.ptr184.5, align 4 + %add.ptr154.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr154.6, align 4 + %add.ptr184.6 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.5, i64 196 + store float %i171, ptr addrspace(1) %add.ptr184.6, align 4 + %add.ptr184.7 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.6, i64 196 + store float %i175, ptr addrspace(1) %add.ptr184.7, align 4 + %add.ptr167.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 14 + store float 0.000000e+00, ptr addrspace(1) %add.ptr167.8, align 4 + %add.ptr175.1.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr167.8, i64 1 + store float 0.000000e+00, ptr addrspace(1) %add.ptr175.1.8, align 4 + %add.ptr184.8 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.7, i64 196 + store float %i179, ptr addrspace(1) %add.ptr184.8, align 4 + %add.ptr184.9 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.8, i64 196 + store float %i183, ptr addrspace(1) %add.ptr184.9, align 4 + %add.ptr184.10 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.9, i64 196 + store float %i187, ptr addrspace(1) %add.ptr184.10, align 4 + %add.ptr184.11 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.10, i64 196 + store float %i191, ptr addrspace(1) %add.ptr184.11, align 4 + %add.ptr184.12 = 
getelementptr inbounds float, ptr addrspace(1) %add.ptr184.11, i64 196 + store float %i195, ptr addrspace(1) %add.ptr184.12, align 4 + %add.ptr184.13 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.12, i64 196 + store float %i199, ptr addrspace(1) %add.ptr184.13, align 4 + %add.ptr184.14 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.13, i64 196 + store float %i203, ptr addrspace(1) %add.ptr184.14, align 4 + %add.ptr184.15 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.14, i64 196 + store float %i207, ptr addrspace(1) %add.ptr184.15, align 4 + %add.ptr184.16 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.15, i64 196 + store float %i211, ptr addrspace(1) %add.ptr184.16, align 4 + %add.ptr184.17 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.16, i64 196 + store float %i215, ptr addrspace(1) %add.ptr184.17, align 4 + %add.ptr184.18 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.17, i64 196 + store float %i219, ptr addrspace(1) %add.ptr184.18, align 4 + %add.ptr184.19 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.18, i64 196 + store float %i223, ptr addrspace(1) %add.ptr184.19, align 4 + %add.ptr184.20 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.19, i64 196 + store float %i227, ptr addrspace(1) %add.ptr184.20, align 4 + %add.ptr184.21 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.20, i64 196 + store float %i231, ptr addrspace(1) %add.ptr184.21, align 4 + %add.ptr184.22 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.21, i64 196 + store float %i235, ptr addrspace(1) %add.ptr184.22, align 4 + %add.ptr184.23 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.22, i64 196 + store float %i239, ptr addrspace(1) %add.ptr184.23, align 4 + %add.ptr184.24 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.23, i64 196 + store float %i243, ptr addrspace(1) %add.ptr184.24, align 4 + %add.ptr184.25 = getelementptr inbounds 
float, ptr addrspace(1) %add.ptr184.24, i64 196 + store float %i247, ptr addrspace(1) %add.ptr184.25, align 4 + %add.ptr184.26 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.25, i64 196 + store float %i251, ptr addrspace(1) %add.ptr184.26, align 4 + %add.ptr184.27 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.26, i64 196 + store float %i255, ptr addrspace(1) %add.ptr184.27, align 4 + %add.ptr184.28 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.27, i64 196 + store float %i259, ptr addrspace(1) %add.ptr184.28, align 4 + %add.ptr184.29 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.28, i64 196 + store float %i263, ptr addrspace(1) %add.ptr184.29, align 4 + %add.ptr184.30 = getelementptr inbounds float, ptr addrspace(1) %add.ptr184.29, i64 196 + store float %i267, ptr addrspace(1) %add.ptr184.30, align 4 + ret void +} + + + +declare float @llvm.fmuladd.f32(float, float, float) #2 +declare i32 @llvm.amdgcn.workitem.id.x() #3 +declare i32 @llvm.amdgcn.workgroup.id.x() #3 +declare align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 + +!0 = !{i32 1, i32 2, i32 1, i32 0} +!1 = !{!"none", !"none", !"none", !"none"} +!2 = !{!"ptr", !"ptr", !"ptr", !"float"} +!3 = !{!"restrict const", !"restrict const", !"restrict", !""} +!4 = !{i32 256, i32 1, i32 1} +!5 = !{i32 0, i32 1024} + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" } +attributes #1 = { nounwind "amdgpu-num-vgpr"="64" } +attributes #2 = { nofree nosync nounwind readnone speculatable willreturn } +attributes #3 = { nounwind readnone speculatable willreturn } \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir index 14bb4310c619ea..3ce6279f9082fb 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir +++ 
b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 { @@ -11,6 +12,20 @@ # GCN-LABEL: name: no_sched_metric_due_to_spills # GCN-NOT: SI_SPILL_ # GCN: S_ENDPGM + +# GCN-GCNTRACKER-LABEL: name: no_sched_metric_due_to_spills +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: SI_SPILL_V32_SAVE +# GCN-GCNTRACKER: S_ENDPGM + +# When using the GCN Trackers, the scheduler is able to achieve desired occupancy without running the high-RP-reschedule stage. However, the RP is still high, +# and RA is unable to allocate without spills. By running the high-RP-reschedule schedule we would have further decreased RP, which provides increased +# flexibility for RA.
+ --- name: no_sched_metric_due_to_spills tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll index 94815558bf3d6d..71f8d91874f04f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll @@ -1,16 +1,24 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs < %s | FileCheck --check-prefix=OCC-GCNTRACKER %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -amdgpu-schedule-relaxed-occupancy=true < %s | FileCheck --check-prefix=RELAX-GCNTRACKER %s ; Using -amgpu-schedule-relaxed-occupancy allows scheduler to produce better ILP by further relaxing occupancy target -; GCN-LABEL: {{^}}load_fma_store: +; CHECK-LABEL: {{^}}load_fma_store: ; OCC: NumVgprs: 32 +; OCC-GCNTRACKER: NumVgprs: 24 ; RELAX: NumVgprs: 64 +; RELAX-GCNTRACKER: NumVgprs: 60 ; OCC: NumVGPRsForWavesPerEU: 32 +; OCC-GCNTRACKER: NumVGPRsForWavesPerEU: 24 ; RELAX: NumVGPRsForWavesPerEU: 64 +; RELAX-GCNTRACKER: NumVGPRsForWavesPerEU: 60 ; OCC: Occupancy: 8 +; OCC-GCNTRACKER: Occupancy: 8 ; RELAX: Occupancy: 4 +; RELAX-GCNTRACKER: Occupancy: 4 define amdgpu_kernel void @load_fma_store(ptr addrspace(3) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) #1 { bb: From 7096cb0b3b78ff07b48f6a6a219907e37970955a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 27 Sep 2024 12:40:02 -0700 Subject: [PATCH 22/27] Remove CurrLIS Change-Id: I228916bf04add1de7615294d1e58ee4213f0bbde --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 21 ++++++++------------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 10 ++++------ 
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 7e2c396270a624..653701ff772307 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -605,26 +605,22 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI, } bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, - bool UseInternalIterator, - LiveIntervals *TheLIS) { + bool UseInternalIterator) { assert(MRI && "call reset first"); SlotIndex SI; - const LiveIntervals *CurrLIS; const MachineInstr *CurrMI; if (UseInternalIterator) { if (!LastTrackedMI) return NextMI == MBBEnd; assert(NextMI == MBBEnd || !NextMI->isDebugInstr()); - CurrLIS = &LIS; CurrMI = LastTrackedMI; SI = NextMI == MBBEnd - ? CurrLIS->getInstructionIndex(*LastTrackedMI).getDeadSlot() - : CurrLIS->getInstructionIndex(*NextMI).getBaseIndex(); + ? LIS.getInstructionIndex(*LastTrackedMI).getDeadSlot() + : LIS.getInstructionIndex(*NextMI).getBaseIndex(); } else { //! 
UseInternalIterator - CurrLIS = TheLIS; - SI = CurrLIS->getInstructionIndex(*MI).getBaseIndex(); + SI = LIS.getInstructionIndex(*MI).getBaseIndex(); CurrMI = MI; } @@ -641,7 +637,7 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, continue; if (!SeenRegs.insert(MO.getReg()).second) continue; - const LiveInterval &LI = CurrLIS->getInterval(MO.getReg()); + const LiveInterval &LI = LIS.getInterval(MO.getReg()); if (LI.hasSubRanges()) { auto It = LiveRegs.end(); for (const auto &S : LI.subranges()) { @@ -699,16 +695,15 @@ void GCNDownwardRPTracker::advanceToNext(MachineInstr *MI, MaxPressure = max(MaxPressure, CurPressure); } -bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator, - LiveIntervals *TheLIS) { +bool GCNDownwardRPTracker::advance(MachineInstr *MI, bool UseInternalIterator) { if (UseInternalIterator && NextMI == MBBEnd) return false; - advanceBeforeNext(MI, UseInternalIterator, TheLIS); + advanceBeforeNext(MI, UseInternalIterator); advanceToNext(MI, UseInternalIterator); if (!UseInternalIterator) { // We must remove any dead def lanes from the current RP - advanceBeforeNext(MI, true, TheLIS); + advanceBeforeNext(MI, true); } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index ae17ea4c348bd3..2058b9099604f6 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -280,10 +280,9 @@ class GCNDownwardRPTracker : public GCNRPTracker { /// it is assumed that the tracker is using an externally managed iterator, /// and advance* calls will not update the state of the iterator. In such /// cases, the tracker will move to the state right before the provided \p MI - /// and use the provided \p TheLIS for RP calculations. + /// and use LIS for RP calculations. 
bool advanceBeforeNext(MachineInstr *MI = nullptr, - bool UseInternalIterator = true, - LiveIntervals *TheLIS = nullptr); + bool UseInternalIterator = true); /// Move to the state at the MI, advanceBeforeNext has to be called first. /// If \p UseInternalIterator is true, then internal iterators are used and @@ -300,9 +299,8 @@ class GCNDownwardRPTracker : public GCNRPTracker { /// then it is assumed that the tracker is using an externally managed /// iterator, and advance* calls will not update the state of the iterator. In /// such cases, the tracker will move to the state right before the provided - /// \p MI and use the provided \p TheLIS for RP calculations. - bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true, - LiveIntervals *TheLIS = nullptr); + /// \p MI and use LIS for RP calculations. + bool advance(MachineInstr *MI = nullptr, bool UseInternalIterator = true); /// Advance instructions until before \p End. bool advance(MachineBasicBlock::const_iterator End); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 28ca41d2dc96ed..b47cdb2e7ddcf1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -490,7 +490,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (GCNTrackers) { MachineInstr *MI = SU->getInstr(); - IsTopNode ? (void)DownwardTracker.advance(MI, false, DAG->getLIS()) + IsTopNode ? 
(void)DownwardTracker.advance(MI, false) : UpwardTracker.recede(*MI); } From 9a6563e1269bc9ef9bb8fbfe491419b31834de8a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 3 Oct 2024 13:42:41 -0700 Subject: [PATCH 23/27] Mark speculative query methods as const Change-Id: I9ebe0cf7252068dcee90d419945085efae75547d --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 55 +++++++++------------ llvm/lib/Target/AMDGPU/GCNRegPressure.h | 6 ++- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 21 +++----- 3 files changed, 33 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 653701ff772307..bd404b8f66e13b 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -458,19 +458,6 @@ void GCNRPTracker::reset(const MachineRegisterInfo &MRI_, MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_); } -void GCNRPTracker::bumpDeadDefs(ArrayRef DeadDefs) { - GCNRegPressure TempPressure = CurPressure; - for (const RegisterMaskPair &P : DeadDefs) { - Register Reg = P.RegUnit; - if (!Reg.isVirtual()) - continue; - LaneBitmask LiveMask = LiveRegs[Reg]; - LaneBitmask BumpedMask = LiveMask | P.LaneMask; - CurPressure.inc(Reg, LiveMask, BumpedMask, *MRI); - } - MaxPressure = max(MaxPressure, CurPressure); - CurPressure = TempPressure; -} /// Mostly copy/paste from CodeGen/RegisterPressure.cpp LaneBitmask GCNRPTracker::getLastUsedLanes(Register RegUnit, SlotIndex Pos) const { @@ -545,8 +532,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(CurPressure == getRegPressure(*MRI, LiveRegs)); } -void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, - const SIRegisterInfo *TRI) { +GCNRegPressure +GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const { assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); @@ 
-559,33 +547,32 @@ void GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI); RegOpers.detectDeadDefs(*MI, LIS); - // Boost max pressure for all dead defs together. - // Since CurrSetPressure and MaxSetPressure - bumpDeadDefs(RegOpers.DeadDefs); + GCNRegPressure TempPressure = CurPressure; // Kill liveness at live defs. for (const RegisterMaskPair &P : RegOpers.Defs) { Register Reg = P.RegUnit; if (!Reg.isVirtual()) continue; - LaneBitmask LiveAfter = LiveRegs[Reg]; + LaneBitmask LiveAfter = + LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0); LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg); LaneBitmask DefLanes = P.LaneMask; LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes; - CurPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI); - MaxPressure = max(MaxPressure, CurPressure); + TempPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI); } // Generate liveness for uses. for (const RegisterMaskPair &P : RegOpers.Uses) { Register Reg = P.RegUnit; if (!Reg.isVirtual()) continue; - LaneBitmask LiveAfter = LiveRegs[Reg]; + LaneBitmask LiveAfter = + LiveRegs.contains(Reg) ? 
LiveRegs.at(Reg) : LaneBitmask(0); LaneBitmask LiveBefore = LiveAfter | P.LaneMask; - CurPressure.inc(Reg, LiveAfter, LiveBefore, *MRI); + TempPressure.inc(Reg, LiveAfter, LiveBefore, *MRI); } - MaxPressure = max(MaxPressure, CurPressure); + return TempPressure; } //////////////////////////////////////////////////////////////////////////////// @@ -746,8 +733,9 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, }); } -void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, - const SIRegisterInfo *TRI) { +GCNRegPressure +GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const { assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); SlotIndex SlotIdx; @@ -757,6 +745,7 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/false); RegOpers.adjustLaneLiveness(LIS, *MRI, SlotIdx); + GCNRegPressure TempPressure = CurPressure; for (const RegisterMaskPair &Use : RegOpers.Uses) { Register Reg = Use.RegUnit; @@ -785,9 +774,10 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, if (LastUseMask.none()) continue; - LaneBitmask LiveMask = LiveRegs[Reg]; + LaneBitmask LiveMask = + LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0); LaneBitmask NewMask = LiveMask & ~LastUseMask; - CurPressure.inc(Reg, LiveMask, NewMask, *MRI); + TempPressure.inc(Reg, LiveMask, NewMask, *MRI); } // Generate liveness for defs. @@ -795,14 +785,13 @@ void GCNDownwardRPTracker::bumpDownwardPressure(const MachineInstr *MI, Register Reg = Def.RegUnit; if (!Reg.isVirtual()) continue; - LaneBitmask LiveMask = LiveRegs[Reg]; + LaneBitmask LiveMask = + LiveRegs.contains(Reg) ? 
LiveRegs.at(Reg) : LaneBitmask(0); LaneBitmask NewMask = LiveMask | Def.LaneMask; - CurPressure.inc(Reg, LiveMask, NewMask, *MRI); + TempPressure.inc(Reg, LiveMask, NewMask, *MRI); } - MaxPressure = max(MaxPressure, CurPressure); - // Boost pressure for all dead defs together. - bumpDeadDefs(RegOpers.DeadDefs); + return TempPressure; } bool GCNUpwardRPTracker::isValid() const { diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 2058b9099604f6..a8b4d7339085b9 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -228,7 +228,8 @@ class GCNUpwardRPTracker : public GCNRPTracker { /// does not rely on the implicit program ordering in the LiveIntervals to /// support RP Speculation. It leaves the state of pressure inconsistent with /// the current position - void bumpUpwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI); + GCNRegPressure bumpUpwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const; /// \p returns whether the tracker's state after receding MI corresponds /// to reported by LIS. @@ -315,7 +316,8 @@ class GCNDownwardRPTracker : public GCNRPTracker { /// does not rely on the implicit program ordering in the LiveIntervals to /// support RP Speculation. It leaves the state of pressure inconsistent with /// the current position - void bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI); + GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, + const SIRegisterInfo *TRI) const; }; /// \returns the LaneMask of live lanes of \p Reg at position \p SI. 
Only the diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index b47cdb2e7ddcf1..e28acd4c07beb6 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -173,25 +173,18 @@ static void getRegisterPressures( // GCNTrackers Pressure.resize(4, 0); MachineInstr *MI = SU->getInstr(); + GCNRegPressure NewPressure; if (AtTop) { GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); - TempDownwardTracker.bumpDownwardPressure(MI, SRI); - Pressure[AMDGPU::RegisterPressureSets::SReg_32] = - TempDownwardTracker.getPressure().getSGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - TempDownwardTracker.getPressure().getArchVGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = - TempDownwardTracker.getPressure().getAGPRNum(); + NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); } else { GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); - TempUpwardTracker.bumpUpwardPressure(MI, SRI); - Pressure[AMDGPU::RegisterPressureSets::SReg_32] = - TempUpwardTracker.getPressure().getSGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = - TempUpwardTracker.getPressure().getArchVGPRNum(); - Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = - TempUpwardTracker.getPressure().getAGPRNum(); + NewPressure = TempUpwardTracker.bumpUpwardPressure(MI, SRI); } + Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = + NewPressure.getArchVGPRNum(); + Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); } void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, From e97630857c5bc838c53d3fe05856004034bff77e Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Sun, 6 Oct 2024 15:43:51 -0700 Subject: [PATCH 24/27] Fix lit tests Change-Id: Ie204904f04dc9d2f53d586795c886a3f8c6b1268 --- llvm/test/CodeGen/AMDGPU/pr51516.mir | 4 ++-- 
.../CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/pr51516.mir b/llvm/test/CodeGen/AMDGPU/pr51516.mir index 49dd5c6c39ff5c..f496a4b06bb237 100644 --- a/llvm/test/CodeGen/AMDGPU/pr51516.mir +++ b/llvm/test/CodeGen/AMDGPU/pr51516.mir @@ -1,5 +1,5 @@ -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -amdgpu-use-amdgpu-trackers=1 -verify-misched -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s # Check that %3 was not rematerialized before the last store since its operand %1 # is killed by that store. 
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir index 3ce6279f9082fb..34d203e0de2ffa 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir @@ -1,5 +1,5 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,1 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -amdgpu-use-amdgpu-trackers=1 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-GCNTRACKER %s --- | define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 { From b6e86d84f803f417cdeef4b0259a6bd6658d6143 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 7 Oct 2024 15:20:02 -0700 Subject: [PATCH 25/27] Remove bumpUpwardPressure Change-Id: I74c8ed0076ff8557d9a23a7ec7b1c9c00290be01 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 73 ----------------------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 8 --- 2 files changed, 81 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index bd404b8f66e13b..150ce86055e7cf 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -296,17 +296,6 @@ collectVirtualRegUses(SmallVectorImpl &RegMaskPairs, } } -/// Mostly copy/paste from CodeGen/RegisterPressure.cpp -static LaneBitmask getRegLanes(ArrayRef RegUnits, - Register RegUnit) { - auto I = 
llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) { - return Other.RegUnit == RegUnit; - }); - if (I == RegUnits.end()) - return LaneBitmask::getNone(); - return I->LaneMask; -} - /// Mostly copy/paste from CodeGen/RegisterPressure.cpp static LaneBitmask getLanesWithProperty( const LiveIntervals &LIS, const MachineRegisterInfo &MRI, @@ -373,25 +362,6 @@ static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS, [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); }); } -// Copy/paste from RegisterPressure.cpp (RegisterOperands::adjustLaneLiveness) -static void adjustDefLaneLiveness(SmallVectorImpl &Defs, - SlotIndex &Pos, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { - for (auto *I = Defs.begin(); I != Defs.end();) { - LaneBitmask LiveAfter = - getLiveLanesAt(LIS, MRI, true, I->RegUnit, Pos.getDeadSlot()); - // If the def is all that is live after the instruction, then in case - // of a subregister def we need a read-undef flag. - LaneBitmask ActualDef = I->LaneMask & LiveAfter; - if (ActualDef.none()) { - I = Defs.erase(I); - } else { - I->LaneMask = ActualDef; - ++I; - } - } -} - /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker @@ -532,49 +502,6 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(CurPressure == getRegPressure(*MRI, LiveRegs)); } -GCNRegPressure -GCNUpwardRPTracker::bumpUpwardPressure(const MachineInstr *MI, - const SIRegisterInfo *TRI) const { - assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction."); - - SlotIndex SlotIdx = LIS.getInstructionIndex(*MI).getRegSlot(); - - // Account for register pressure similar to RegPressureTracker::recede(). 
- RegisterOperands RegOpers; - - RegOpers.collect(*MI, *TRI, *MRI, true, /*IgnoreDead=*/true); - assert(RegOpers.DeadDefs.empty()); - adjustDefLaneLiveness(RegOpers.Defs, SlotIdx, LIS, *MRI); - RegOpers.detectDeadDefs(*MI, LIS); - - GCNRegPressure TempPressure = CurPressure; - - // Kill liveness at live defs. - for (const RegisterMaskPair &P : RegOpers.Defs) { - Register Reg = P.RegUnit; - if (!Reg.isVirtual()) - continue; - LaneBitmask LiveAfter = - LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0); - LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg); - LaneBitmask DefLanes = P.LaneMask; - LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes; - - TempPressure.inc(Reg, LiveAfter, LiveAfter & LiveBefore, *MRI); - } - // Generate liveness for uses. - for (const RegisterMaskPair &P : RegOpers.Uses) { - Register Reg = P.RegUnit; - if (!Reg.isVirtual()) - continue; - LaneBitmask LiveAfter = - LiveRegs.contains(Reg) ? LiveRegs.at(Reg) : LaneBitmask(0); - LaneBitmask LiveBefore = LiveAfter | P.LaneMask; - TempPressure.inc(Reg, LiveAfter, LiveBefore, *MRI); - } - return TempPressure; -} - //////////////////////////////////////////////////////////////////////////////// // GCNDownwardRPTracker diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index a8b4d7339085b9..fcf5360881117e 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -223,14 +223,6 @@ class GCNUpwardRPTracker : public GCNRPTracker { /// to false allows for an externally managed iterator / program order. void recede(const MachineInstr &MI); - /// Mostly copy/paste from CodeGen/RegisterPressure.cpp - /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This - /// does not rely on the implicit program ordering in the LiveIntervals to - /// support RP Speculation. 
It leaves the state of pressure inconsistent with - /// the current position - GCNRegPressure bumpUpwardPressure(const MachineInstr *MI, - const SIRegisterInfo *TRI) const; - /// \p returns whether the tracker's state after receding MI corresponds /// to reported by LIS. bool isValid() const; From 35ab17309b952cdfba10813776e3340ed3d5f7b6 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 8 Oct 2024 08:26:39 -0700 Subject: [PATCH 26/27] Changes from pull/111452 + use the new recede Change-Id: I9d6ba224894947f6e94df9ece3faf3f73d4e700f --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 10 ---------- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 ++- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 150ce86055e7cf..05053a3f3caaa6 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -352,16 +352,6 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, return LastUseMask; } -/// Mostly copy/paste from CodeGen/RegisterPressure.cpp -static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS, - const MachineRegisterInfo &MRI, - bool TrackLaneMasks, Register RegUnit, - SlotIndex Pos) { - return getLanesWithProperty( - LIS, MRI, TrackLaneMasks, RegUnit, Pos, LaneBitmask::getAll(), - [](const LiveRange &LR, SlotIndex Pos) { return LR.liveAt(Pos); }); -} - /////////////////////////////////////////////////////////////////////////////// // GCNRPTracker diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index e28acd4c07beb6..f5f47d5268d8a3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -179,7 +179,8 @@ static void getRegisterPressures( NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); } else { GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); - NewPressure = 
TempUpwardTracker.bumpUpwardPressure(MI, SRI); + TempUpwardTracker.recede(*MI); + NewPressure = TempUpwardTracker.getPressure(); } Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = From aa74786d3202cd99afb9e39754b82d8a6ceedf9c Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 8 Oct 2024 13:43:37 -0700 Subject: [PATCH 27/27] Code / comment cleanup Change-Id: I841ac8dd48b520397f96452f1d167613edf94895 --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 17 +++++++++-------- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 7 +++---- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index 05053a3f3caaa6..d46c4cf23a221e 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -325,7 +325,7 @@ static LaneBitmask getLanesWithProperty( } /// Mostly copy/paste from CodeGen/RegisterPressure.cpp -/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx). +/// Helper to find a vreg use between two indices {PriorUseIdx, NextUseIdx}. /// The query starts with a lane bitmask which gets lanes/bits removed for every /// use we find. static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, @@ -341,13 +341,14 @@ static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask, SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); bool InRange = Upward ? 
(InstSlot > PriorUseIdx && InstSlot <= NextUseIdx) : (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx); - if (InRange) { - unsigned SubRegIdx = MO.getSubReg(); - LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); - LastUseMask &= ~UseMask; - if (LastUseMask.none()) - return LaneBitmask::getNone(); - } + if (!InRange) + continue; + + unsigned SubRegIdx = MO.getSubReg(); + LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx); + LastUseMask &= ~UseMask; + if (LastUseMask.none()) + return LaneBitmask::getNone(); } return LastUseMask; } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index fcf5360881117e..06c3d9027db1b5 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -304,10 +304,9 @@ class GCNDownwardRPTracker : public GCNRPTracker { const LiveRegSet *LiveRegsCopy = nullptr); /// Mostly copy/paste from CodeGen/RegisterPressure.cpp - /// Calculate the impact \p MI will have on CurPressure and MaxPressure. This - /// does not rely on the implicit program ordering in the LiveIntervals to - /// support RP Speculation. It leaves the state of pressure inconsistent with - /// the current position + /// Calculate the impact \p MI will have on CurPressure and \return the + /// speculated pressure. In order to support RP Speculation, this does not + /// rely on the implicit program ordering in the LiveIntervals. GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const; }; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f5f47d5268d8a3..11c95675aeeafa 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -774,7 +774,7 @@ GCNScheduleDAGMILive::getRegionLiveOutMap() const { void RegionPressureMap::buildLiveRegMap() { IdxToInstruction.clear(); - BBLiveRegMap = + RegionLiveRegMap = IsLiveOut ? 
DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); for (unsigned I = 0; I < DAG->Regions.size(); I++) { MachineInstr *RegionKey = diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 91b4c0c63d2bb3..64d517038f90e0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -181,7 +181,7 @@ class RegionPressureMap { GCNScheduleDAGMILive *DAG; // The live in/out pressure as indexed by the first or last MI in the region // before scheduling. - DenseMap BBLiveRegMap; + DenseMap RegionLiveRegMap; // The mapping of RegionIDx to key instruction DenseMap IdxToInstruction; // Whether we are calculating LiveOuts or LiveIns @@ -198,7 +198,7 @@ class RegionPressureMap { GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) { assert(IdxToInstruction.find(RegionIdx) != IdxToInstruction.end()); MachineInstr *Key = IdxToInstruction[RegionIdx]; - return BBLiveRegMap[Key]; + return RegionLiveRegMap[Key]; } };