diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a05e21b37b9127..f15130d5f8b611 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
 // expected-remark@+10 {{Function Name: foo}}
-// expected-remark@+9 {{ SGPRs: 13}}
+// expected-remark@+9 {{ TotalSGPRs: 13}}
 // expected-remark@+8 {{ VGPRs: 10}}
 // expected-remark@+7 {{ AGPRs: 12}}
 // expected-remark@+6 {{ ScratchSize [bytes/lane]: 0}}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 9e11b13c101d47..38300863f7889a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1757,6 +1757,55 @@ As part of the AMDGPU MC layer, AMDGPU provides the following target specific
     =================== ================= ========================================================

+Function Resource Usage
+-----------------------
+
+A function's resource usage depends on each of its callees' resource usage. The
+expressions used to denote resource usage reflect this by propagating each
+callee's equivalent expressions. Said expressions are emitted as symbols by the
+compiler when compiling to either assembly or object format and should not be
+overwritten or redefined.
+
+The following describes all emitted function resource usage symbols:
+
+  .. table:: Function Resource Usage:
+     :name: function-usage-table
+
+     ===================================== ========= ========================================= ===============================================================================
+     Symbol                                Type      Description                               Example
+     ===================================== ========= ========================================= ===============================================================================
+     <name>.num_vgpr                       Integer   Number of VGPRs used by <name>,           .set foo.num_vgpr, max(32, bar.num_vgpr, baz.num_vgpr)
+                                                     worst case of itself and its callees'
+                                                     VGPR use
+     <name>.num_agpr                       Integer   Number of AGPRs used by <name>,           .set foo.num_agpr, max(35, bar.num_agpr)
+                                                     worst case of itself and its callees'
+                                                     AGPR use
+     <name>.numbered_sgpr                  Integer   Number of SGPRs used by <name>,           .set foo.numbered_sgpr, 21
+                                                     worst case of itself and its callees'
+                                                     SGPR use (without any of the implicitly
+                                                     used SGPRs)
+     <name>.private_seg_size               Integer   Total stack size required for <name>,     .set foo.private_seg_size, 16+max(bar.private_seg_size, baz.private_seg_size)
+                                                     expression is the
+                                                     locally used stack size + the worst case
+                                                     callee
+     <name>.uses_vcc                       Bool      Whether <name>, or any of its             .set foo.uses_vcc, or(0, bar.uses_vcc)
+                                                     callees, uses vcc
+     <name>.uses_flat_scratch              Bool      Whether <name>, or any of its             .set foo.uses_flat_scratch, 1
+                                                     callees, uses flat scratch or not
+     <name>.has_dyn_sized_stack            Bool      Whether the stack of <name>, or of any    .set foo.has_dyn_sized_stack, 1
+                                                     of its callees, is dynamically sized
+     <name>.has_recursion                  Bool      Whether <name>, or any of its             .set foo.has_recursion, 0
+                                                     callees, contains recursion
+     <name>.has_indirect_call              Bool      Whether <name>, or any of its             .set foo.has_indirect_call, max(0, bar.has_indirect_call)
+                                                     callees, contains an indirect call
+     ===================================== ========= ========================================= ===============================================================================
+
+Furthermore, three symbols are additionally emitted describing the compilation
+unit's worst case (i.e., maxima) ``num_vgpr``, ``num_agpr``, and
+``numbered_sgpr`` which may be referenced and used by the aforementioned
+symbolic expressions. These three symbols are ``amdgpu.max_num_vgpr``,
+``amdgpu.max_num_agpr``, and ``amdgpu.max_num_sgpr``.
+
 .. _amdgpu-elf-code-object:

 ELF Code Object
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 682c29457c35ee..4f6633d8027c70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPU.h"
 #include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUMCResourceInfo.h"
 #include "AMDGPUResourceUsageAnalysis.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -33,6 +34,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -359,6 +361,127 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
   return AsmPrinter::doInitialization(M);
 }

+void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
+  if (F.isDeclaration() ||
+      !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+    return;
+
+  using RIK = MCResourceInfo::ResourceInfoKind;
+  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+
+  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
+    int64_t Val;
+    if (Value->evaluateAsAbsolute(Val)) {
+      Res = Val;
+      return true;
+    }
+    return false;
+  };
+
+  const uint64_t MaxScratchPerWorkitem =
+      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
+  MCSymbol *ScratchSizeSymbol =
+      RI.getSymbol(F.getName(), RIK::RIK_PrivateSegSize, OutContext);
+  uint64_t ScratchSize;
+  if (ScratchSizeSymbol->isVariable() &&
+      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
+      ScratchSize > MaxScratchPerWorkitem) {
+    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
+                                          DS_Error);
+    F.getContext().diagnose(DiagStackSize);
+  }
+
+  // Validate addressable scalar registers (i.e., prior to added implicit
+  // SGPRs).
+  MCSymbol *NumSGPRSymbol =
+      RI.getSymbol(F.getName(), RIK::RIK_NumSGPR, OutContext);
+  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+      !STM.hasSGPRInitBug()) {
+    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+    uint64_t NumSgpr;
+    if (NumSGPRSymbol->isVariable() &&
+        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+        NumSgpr > MaxAddressableNumSGPRs) {
+      DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
+                                       NumSgpr, MaxAddressableNumSGPRs,
+                                       DS_Error, DK_ResourceLimit);
+      F.getContext().diagnose(Diag);
+      return;
+    }
+  }
+
+  MCSymbol *VCCUsedSymbol =
+      RI.getSymbol(F.getName(), RIK::RIK_UsesVCC, OutContext);
+  MCSymbol *FlatUsedSymbol =
+      RI.getSymbol(F.getName(), RIK::RIK_UsesFlatScratch, OutContext);
+  uint64_t VCCUsed, FlatUsed, NumSgpr;
+
+  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
+      FlatUsedSymbol->isVariable() &&
+      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
+      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
+
+    // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
+    // resolvable.
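+    // The implicit SGPRs are the ones reserved for VCC, flat scratch, and
+    // XNACK use; see IsaInfo::getNumExtraSGPRs below.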
+    NumSgpr += IsaInfo::getNumExtraSGPRs(
+        &STM, VCCUsed, FlatUsed,
+        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+    if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+        STM.hasSGPRInitBug()) {
+      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+      if (NumSgpr > MaxAddressableNumSGPRs) {
+        DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
+                                         MaxAddressableNumSGPRs, DS_Error,
+                                         DK_ResourceLimit);
+        F.getContext().diagnose(Diag);
+        return;
+      }
+    }
+
+    MCSymbol *NumVgprSymbol =
+        RI.getSymbol(F.getName(), RIK::RIK_NumVGPR, OutContext);
+    MCSymbol *NumAgprSymbol =
+        RI.getSymbol(F.getName(), RIK::RIK_NumAGPR, OutContext);
+    uint64_t NumVgpr, NumAgpr;
+
+    MachineModuleInfo &MMI =
+        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+    MachineFunction *MF = MMI.getMachineFunction(F);
+    if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
+        TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
+        TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
+      const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+      unsigned MaxWaves = MFI.getMaxWavesPerEU();
+      uint64_t TotalNumVgpr =
+          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
+      uint64_t NumVGPRsForWavesPerEU = std::max(
+          {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
+      uint64_t NumSGPRsForWavesPerEU = std::max(
+          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
+      const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
+          STM.computeOccupancy(F, MFI.getLDSSize()),
+          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
+          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
+          OutContext);
+      uint64_t Occupancy;
+
+      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
+          F, "amdgpu-waves-per-eu", {0, 0}, true);
+
+      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
+        DiagnosticInfoOptimizationFailure Diag(
+            F, F.getSubprogram(),
+            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
+            "'" +
+                F.getName() + "': desired occupancy was " + Twine(MinWEU) +
+                ", final occupancy is " + Twine(Occupancy));
+        F.getContext().diagnose(Diag);
+        return;
+      }
+    }
+  }
+}
+
 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
   // Pad with s_code_end to help tools and guard against instruction prefetch
   // causing stale data in caches. Arguably this should be done by the linker,
@@ -371,25 +494,24 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
     getTargetStreamer()->EmitCodeEnd(STI);
   }

-  return AsmPrinter::doFinalization(M);
-}
+  // Assign expressions which can only be resolved when all other functions are
+  // known.
+  RI.finalize(OutContext);

-// Print comments that apply to both callable functions and entry points.
-void AMDGPUAsmPrinter::emitCommonFunctionComments(
-    uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
-    uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
-    const AMDGPUMachineFunction *MFI) {
-  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
-  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
-  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
-  if (NumAGPR) {
-    OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
-    OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
-                                false);
-  }
-  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
-  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
-                              false);
+  // Switch section and emit all GPR maximums within the processed module.
+  OutStreamer->pushSection();
+  MCSectionELF *MaxGPRSection =
+      OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
+  OutStreamer->switchSection(MaxGPRSection);
+  getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
+                                              RI.getMaxAGPRSymbol(OutContext),
+                                              RI.getMaxSGPRSymbol(OutContext));
+  OutStreamer->popSection();
+
+  for (Function &F : M.functions())
+    validateMCResourceInfo(F);
+
+  return AsmPrinter::doFinalization(M);
 }

 SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
@@ -402,12 +524,14 @@ SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
   return Str;
 }

+// Print comments that apply to both callable functions and entry points.
 void AMDGPUAsmPrinter::emitCommonFunctionComments(
     const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
     const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
     const AMDGPUMachineFunction *MFI) {
   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
-  OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
+  OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
+                              false);
   OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
   if (NumAGPR && TotalNumVGPR) {
     OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
@@ -540,6 +664,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     OutStreamer->switchSection(ConfigSection);
   }

+  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+      ResourceUsage->getResourceInfo();
+  RI.gatherResourceInfo(MF, Info, OutContext);
+
   if (MFI->isModuleEntryFunction()) {
     getSIProgramInfo(CurrentProgramInfo, MF);
   }
@@ -571,21 +699,44 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   emitResourceUsageRemarks(MF, CurrentProgramInfo,
                            MFI->isModuleEntryFunction(), STM.hasMAIInsts());

+  {
+    using RIK = MCResourceInfo::ResourceInfoKind;
+    getTargetStreamer()->EmitMCResourceInfo(
+        RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_NumSGPR, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_UsesVCC, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_HasRecursion, OutContext),
+        RI.getSymbol(MF.getName(), RIK::RIK_HasIndirectCall, OutContext));
+  }
+
   if (isVerbose()) {
     MCSectionELF *CommentSection =
         Context.getELFSection(".AMDGPU.csdata",
                              ELF::SHT_PROGBITS, 0);
     OutStreamer->switchSection(CommentSection);

     if (!MFI->isEntryFunction()) {
+      using RIK = MCResourceInfo::ResourceInfoKind;
       OutStreamer->emitRawComment(" Function info:", false);
-      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
-          ResourceUsage->getResourceInfo(&MF.getFunction());
+
       emitCommonFunctionComments(
-          Info.NumVGPR,
-          STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
-          Info.getTotalNumVGPRs(STM),
-          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
-          Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
+          RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext)
+              ->getVariableValue(),
+          STM.hasMAIInsts()
+              ? RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext)
+                    ->getVariableValue()
+              : nullptr,
+          RI.createTotalNumVGPRs(MF, Ctx),
+          RI.createTotalNumSGPRs(
+              MF,
+              MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
+              Ctx),
+          RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext)
+              ->getVariableValue(),
+          getFunctionCodeSize(MF), MFI);
       return false;
     }
@@ -751,10 +902,26 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
   return CodeSize;
 }

+// AccumOffset computed for the MCExpr equivalent of:
+// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
+static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
+  const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
+  const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
+
+  // Can't be lower than 1 for subsequent alignTo.
+  const MCExpr *MaximumTaken =
+      AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
+
+  // Practically, it's computing divideCeil(MaximumTaken, 4).
+  const MCExpr *DivCeil = MCBinaryExpr::createDiv(
+      AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
+      Ctx);
+
+  return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
+}
+
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                         const MachineFunction &MF) {
-  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
-      ResourceUsage->getResourceInfo(&MF.getFunction());
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   MCContext &Ctx = MF.getContext();

@@ -771,28 +938,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     return false;
   };

-  ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
-  ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
-  ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
-  ProgInfo.AccumOffset =
-      CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
+  auto GetSymRefExpr =
+      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
+    MCSymbol *Sym = RI.getSymbol(MF.getName(), RIK, OutContext);
+    return MCSymbolRefExpr::create(Sym, Ctx);
+  };
+
+  using RIK = MCResourceInfo::ResourceInfoKind;
+  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
+  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
+  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
+      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+
+  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
   ProgInfo.TgSplit = STM.isTgSplitEnabled();
-  ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
-  ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
-  ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
-  ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
+  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
+  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
+  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
   ProgInfo.DynamicCallStack =
-      CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
-
-  const uint64_t MaxScratchPerWorkitem =
-      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
-  uint64_t ScratchSize;
-  if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
-      ScratchSize > MaxScratchPerWorkitem) {
-    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
-                                          MaxScratchPerWorkitem, DS_Error);
-    MF.getFunction().getContext().diagnose(DiagStackSize);
-  }
+      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
+                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1477,6 +1643,8 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
 void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<AMDGPUResourceUsageAnalysis>();
   AU.addPreserved<AMDGPUResourceUsageAnalysis>();
+  AU.addRequired<MachineModuleInfoWrapperPass>();
+  AU.addPreserved<MachineModuleInfoWrapperPass>();
   AsmPrinter::getAnalysisUsage(AU);
 }

@@ -1522,7 +1690,7 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
   // printing multiple diagnostic location and diag opts.
   EmitResourceUsageRemark("FunctionName", "Function Name",
                           MF.getFunction().getName());
-  EmitResourceUsageRemark("NumSGPR", "SGPRs",
+  EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
                           getMCExprStr(CurrentProgramInfo.NumSGPR));
   EmitResourceUsageRemark("NumVGPR", "VGPRs",
                           getMCExprStr(CurrentProgramInfo.NumArchVGPR));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f66bbde42ce278..cc8c4411805e23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H

+#include "AMDGPUMCResourceInfo.h"
 #include "SIProgramInfo.h"
 #include "llvm/CodeGen/AsmPrinter.h"

@@ -24,6 +25,7 @@ struct AMDGPUResourceUsageAnalysis;
 class AMDGPUTargetStreamer;
 class MCCodeEmitter;
 class MCOperand;
+class MCResourceInfo;

 namespace AMDGPU {
 struct MCKernelDescriptor;
@@ -40,6 +42,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter {

   AMDGPUResourceUsageAnalysis *ResourceUsage;

+  MCResourceInfo RI;
+
   SIProgramInfo CurrentProgramInfo;

   std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
@@ -60,11 +64,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
   void EmitPALMetadata(const MachineFunction &MF,
                        const SIProgramInfo &KernelInfo);
   void emitPALFunctionMetadata(const MachineFunction &MF);
-  void emitCommonFunctionComments(uint32_t NumVGPR,
-                                  std::optional<uint32_t> NumAGPR,
-                                  uint32_t TotalNumVGPR, uint32_t NumSGPR,
-                                  uint64_t ScratchSize, uint64_t CodeSize,
-                                  const AMDGPUMachineFunction *MFI);
   void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
                                   const MCExpr *TotalNumVGPR,
                                   const MCExpr *NumSGPR,
@@ -84,6 +83,11 @@ class AMDGPUAsmPrinter final : public AsmPrinter {

   SmallString<128> getMCExprStr(const MCExpr *Value);

+  /// Attempts to replace the validation that is missed in getSIProgramInfo due
+  /// to MCExpr being unknown. Invoked during doFinalization such that the
+  /// MCResourceInfo symbols are known.
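+  /// Checks the scratch size, the number of (addressable) scalar registers,
+  /// and the occupancy target given by the 'amdgpu-waves-per-eu' attribute.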
+  void validateMCResourceInfo(Function &F);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
new file mode 100644
index 00000000000000..b5f53290eff3ac
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -0,0 +1,224 @@
+//===- AMDGPUMCResourceInfo.cpp --- MC Resource Info ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCResourceInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
+                                    MCContext &OutContext) {
+  auto GOCS = [this, FuncName, &OutContext](StringRef Suffix) {
+    return OutContext.getOrCreateSymbol(FuncName + Twine(Suffix));
+  };
+  switch (RIK) {
+  case RIK_NumVGPR:
+    return GOCS(".num_vgpr");
+  case RIK_NumAGPR:
+    return GOCS(".num_agpr");
+  case RIK_NumSGPR:
+    return GOCS(".numbered_sgpr");
+  case RIK_PrivateSegSize:
+    return GOCS(".private_seg_size");
+  case RIK_UsesVCC:
+    return GOCS(".uses_vcc");
+  case RIK_UsesFlatScratch:
+    return GOCS(".uses_flat_scratch");
+  case RIK_HasDynSizedStack:
+    return GOCS(".has_dyn_sized_stack");
+  case RIK_HasRecursion:
+    return GOCS(".has_recursion");
+  case RIK_HasIndirectCall:
+    return GOCS(".has_indirect_call");
+  }
+  llvm_unreachable("Unexpected ResourceInfoKind.");
+}
+
+const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName,
+                                            ResourceInfoKind RIK,
+                                            MCContext &Ctx) {
+  return MCSymbolRefExpr::create(getSymbol(FuncName, RIK, Ctx), Ctx);
+}
+
+void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
+  // Assign expression to get the max register use to the max_num_Xgpr symbol.
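+  // The maxima are only correct once every function has been processed, which
+  // is why this is invoked from finalize() rather than per function.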
+  MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
+  MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
+  MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
+
+  auto assignMaxRegSym = [this, &OutContext](MCSymbol *Sym, int32_t RegCount) {
+    const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
+    Sym->setVariableValue(MaxExpr);
+  };
+
+  assignMaxRegSym(MaxVGPRSym, MaxVGPR);
+  assignMaxRegSym(MaxAGPRSym, MaxAGPR);
+  assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+}
+
+void MCResourceInfo::finalize(MCContext &OutContext) {
+  assert(!Finalized && "Cannot finalize ResourceInfo again.");
+  Finalized = true;
+  assignMaxRegs(OutContext);
+}
+
+MCSymbol *MCResourceInfo::getMaxVGPRSymbol(MCContext &OutContext) {
+  return OutContext.getOrCreateSymbol("amdgpu.max_num_vgpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxAGPRSymbol(MCContext &OutContext) {
+  return OutContext.getOrCreateSymbol("amdgpu.max_num_agpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
+  return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
+}
+
+void MCResourceInfo::assignResourceInfoExpr(
+    int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
+    const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
+    MCContext &OutContext) {
+  const MCConstantExpr *LocalConstExpr =
+      MCConstantExpr::create(LocalValue, OutContext);
+  const MCExpr *SymVal = LocalConstExpr;
+  if (!Callees.empty()) {
+    SmallVector<const MCExpr *, 8> ArgExprs;
+    // Avoid recursive symbol assignment.
+    SmallPtrSet<const Function *, 8> Seen;
+    ArgExprs.push_back(LocalConstExpr);
+    const Function &F = MF.getFunction();
+    Seen.insert(&F);
+
+    for (const Function *Callee : Callees) {
+      if (!Seen.insert(Callee).second)
+        continue;
+      MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK, OutContext);
+      ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
+    }
+    SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
+  }
+  MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
+  Sym->setVariableValue(SymVal);
+}
+
+void MCResourceInfo::gatherResourceInfo(
+    const MachineFunction &MF,
+    const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI,
+    MCContext &OutContext) {
+  // Worst case VGPR use for non-hardware-entrypoints.
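+  // Entry functions are excluded as candidates since they cannot be the
+  // target of an indirect call.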
+  MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
+  MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
+  MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
+
+  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
+    addMaxVGPRCandidate(FRI.NumVGPR);
+    addMaxAGPRCandidate(FRI.NumAGPR);
+    addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+  }
+
+  auto SetMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
+                       ResourceInfoKind RIK) {
+    if (!FRI.HasIndirectCall) {
+      assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
+                             FRI.Callees, OutContext);
+    } else {
+      const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext);
+      MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK, OutContext);
+      const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax(
+          {MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext);
+      LocalNumSym->setVariableValue(MaxWithLocal);
+    }
+  };
+
+  SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
+  SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
+  SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+
+  {
+    // The expression for private segment size should be:
+    // FRI.PrivateSegmentSize + max(FRI.Callees, FRI.CalleeSegmentSize)
+    SmallVector<const MCExpr *, 8> ArgExprs;
+    if (FRI.CalleeSegmentSize)
+      ArgExprs.push_back(
+          MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
+
+    if (!FRI.HasIndirectCall) {
+      for (const Function *Callee : FRI.Callees) {
+        MCSymbol *calleeValSym =
+            getSymbol(Callee->getName(), RIK_PrivateSegSize, OutContext);
+        ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+      }
+    }
+    const MCExpr *localConstExpr =
+        MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
+    if (!ArgExprs.empty()) {
+      const AMDGPUMCExpr *transitiveExpr =
+          AMDGPUMCExpr::createMax(ArgExprs, OutContext);
+      localConstExpr =
+          MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext);
+    }
+    getSymbol(MF.getName(), RIK_PrivateSegSize, OutContext)
+        ->setVariableValue(localConstExpr);
+  }
+
+  auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
+    MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
+    Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
+  };
+
+  if (!FRI.HasIndirectCall) {
+    assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
+    assignResourceInfoExpr(FRI.UsesFlatScratch,
+                           ResourceInfoKind::RIK_UsesFlatScratch,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
+    assignResourceInfoExpr(FRI.HasDynamicallySizedStack,
+                           ResourceInfoKind::RIK_HasDynSizedStack,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
+    assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
+    assignResourceInfoExpr(FRI.HasIndirectCall,
+                           ResourceInfoKind::RIK_HasIndirectCall,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
+  } else {
+    SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+    SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+    SetToLocal(FRI.HasDynamicallySizedStack,
+               ResourceInfoKind::RIK_HasDynSizedStack);
+    SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+    SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+  }
+}
+
+const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF,
+                                                  MCContext &Ctx) {
+  return AMDGPUMCExpr::createTotalNumVGPR(
+      getSymRefExpr(MF.getName(), RIK_NumAGPR, Ctx),
+      getSymRefExpr(MF.getName(), RIK_NumVGPR, Ctx), Ctx);
+}
+
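+// Total SGPR count is the numbered SGPRs plus the extra SGPRs implied by VCC
+// and flat scratch use (and XNACK), kept symbolic so it can be resolved once
+// the per-function symbols are assigned.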
+const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF,
+                                                  bool hasXnack,
+                                                  MCContext &Ctx) {
+  return MCBinaryExpr::createAdd(
+      getSymRefExpr(MF.getName(), RIK_NumSGPR, Ctx),
+      AMDGPUMCExpr::createExtraSGPRs(
+          getSymRefExpr(MF.getName(), RIK_UsesVCC, Ctx),
+          getSymRefExpr(MF.getName(), RIK_UsesFlatScratch, Ctx), hasXnack, Ctx),
+      Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
new file mode 100644
index 00000000000000..08c0c106d5aa9b
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -0,0 +1,102 @@
+//===- AMDGPUMCResourceInfo.h ----- MC Resource Info --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
+
+namespace llvm {
+
+class MCContext;
+class MCSymbol;
+class StringRef;
+class MachineFunction;
+
+class MCResourceInfo {
+public:
+  enum ResourceInfoKind {
+    RIK_NumVGPR,
+    RIK_NumAGPR,
+    RIK_NumSGPR,
+    RIK_PrivateSegSize,
+    RIK_UsesVCC,
+    RIK_UsesFlatScratch,
+    RIK_HasDynSizedStack,
+    RIK_HasRecursion,
+    RIK_HasIndirectCall
+  };
+
+private:
+  int32_t MaxVGPR = 0;
+  int32_t MaxAGPR = 0;
+  int32_t MaxSGPR = 0;
+
+  // Whether the MCResourceInfo has been finalized through finalize(MCContext
+  // &). finalize() should only be called once, at the end of AsmPrinting, to
+  // assign the MaxXGPR symbols their final values.
+  bool Finalized = false;
+
+  void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
+                              AMDGPUMCExpr::VariantKind Kind,
+                              const MachineFunction &MF,
+                              const SmallVectorImpl<const Function *> &Callees,
+                              MCContext &OutContext);
+
+  // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
+  void assignMaxRegs(MCContext &OutContext);
+
+public:
+  MCResourceInfo() = default;
+  void addMaxVGPRCandidate(int32_t candidate) {
+    MaxVGPR = std::max(MaxVGPR, candidate);
+  }
+  void addMaxAGPRCandidate(int32_t candidate) {
+    MaxAGPR = std::max(MaxAGPR, candidate);
+  }
+  void addMaxSGPRCandidate(int32_t candidate) {
+    MaxSGPR = std::max(MaxSGPR, candidate);
+  }
+
+  MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
+                      MCContext &OutContext);
+  const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK,
+                              MCContext &Ctx);
+
+  // Resolves the final symbols that require the inter-function resource info
+  // to be resolved.
+  void finalize(MCContext &OutContext);
+
+  MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
+  MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
+  MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
+
+  /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
+  /// granularity. However, some resource info has to be assigned the
+  /// call-transitive maximum or accumulated value. For example, if A calls B
+  /// and B's VGPR usage exceeds A's, A should be assigned B's VGPR usage.
+  /// Furthermore, functions with indirect calls should be assigned the module
+  /// level maximum.
+  void gatherResourceInfo(
+      const MachineFunction &MF,
+      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI,
+      MCContext &OutContext);
+
+  const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx);
+  const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack,
+                                    MCContext &Ctx);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0aca99a82d1978..1ee3c40d69a3b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -13,14 +13,6 @@
 /// The results of this analysis are used to fill the register usage, flat
 /// usage, etc. into hardware registers.
 ///
-/// The analysis takes callees into account. E.g. if a function A that needs 10
-/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
-/// will return 20.
-/// It is assumed that an indirect call can go into any function except
-/// hardware-entrypoints. Therefore the register usage of functions with
-/// indirect calls is estimated as the maximum of all non-entrypoint functions
-/// in the module.
-///
 //===----------------------------------------------------------------------===//

 #include "AMDGPUResourceUsageAnalysis.h"
@@ -28,8 +20,8 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Analysis/CallGraph.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
@@ -78,92 +70,37 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
   return false;
 }

-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
-    const GCNSubtarget &ST) const {
-  return NumExplicitSGPR +
-         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
-                                   ST.getTargetID().isXnackOnOrAny());
-}
-
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
-    const GCNSubtarget &ST) const {
-  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
-}
-
-bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
+bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC)
     return false;

-  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
   const TargetMachine &TM = TPC->getTM<TargetMachine>();
   const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
-  bool HasIndirectCall = false;
-
-  CallGraph CG = CallGraph(M);
-  auto End = po_end(&CG);

   // By default, for code object v5 and later, track only the minimum scratch
   // size
   uint32_t AssumedStackSizeForDynamicSizeObjects =
       clAssumedStackSizeForDynamicSizeObjects;
   uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
-  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
+  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
+          AMDGPU::AMDHSA_COV5 ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL) {
-    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
+    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
       AssumedStackSizeForDynamicSizeObjects = 0;
-    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
+    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
   }
-
-  for (auto IT = po_begin(&CG); IT != End; ++IT) {
-    Function *F = IT->getFunction();
-    if (!F || F->isDeclaration())
-      continue;
-
-    MachineFunction *MF = MMI.getMachineFunction(*F);
-    assert(MF && "function must have been generated already");
-
-    auto CI =
-        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
-    SIFunctionResourceInfo &Info = CI.first->second;
-    assert(CI.second && "should only be called once per function");
-    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
-                                AssumedStackSizeForExternalCall);
-    HasIndirectCall |= Info.HasIndirectCall;
-  }
-
-  // It's possible we have unreachable functions in the module which weren't
-  // visited by the PO traversal. Make sure we have some resource counts to
-  // report.
-  for (const auto &IT : CG) {
-    const Function *F = IT.first;
-    if (!F || F->isDeclaration())
-      continue;
-
-    auto CI =
-        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
-    if (!CI.second) // Skip already visited functions
-      continue;
-
-    SIFunctionResourceInfo &Info = CI.first->second;
-    MachineFunction *MF = MMI.getMachineFunction(*F);
-    assert(MF && "function must have been generated already");
-    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
-                                AssumedStackSizeForExternalCall);
-    HasIndirectCall |= Info.HasIndirectCall;
-  }
-
-  if (HasIndirectCall)
-    propagateIndirectCallRegisterUsage();
+  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
                                      AssumedStackSizeForExternalCall);

   return false;
 }

 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
 AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
-    const MachineFunction &MF, const TargetMachine &TM,
-    uint32_t AssumedStackSizeForDynamicSizeObjects,
+    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
     uint32_t AssumedStackSizeForExternalCall) const {
   SIFunctionResourceInfo Info;
@@ -253,7 +190,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   int32_t MaxVGPR = -1;
   int32_t MaxAGPR = -1;
   int32_t MaxSGPR = -1;
-  uint64_t CalleeFrameSize = 0;
+  Info.CalleeSegmentSize = 0;

   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
@@ -512,8 +449,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
             TII->getNamedOperand(MI, AMDGPU::OpName::callee);

         const Function *Callee = getCalleeFunction(*CalleeOp);
-        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
-            CallGraphResourceInfo.end();

         // Avoid crashing on undefined behavior with an illegal call to a
         // kernel. If a callsite's calling convention doesn't match the
@@ -522,9 +457,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
           report_fatal_error("invalid call to entry function");

+        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
+          return F == &MF.getFunction();
+        };
+
+        if (Callee && !isSameFunction(MF, Callee))
+          Info.Callees.push_back(Callee);
+
         bool IsIndirect = !Callee || Callee->isDeclaration();
-        if (!IsIndirect)
-          I = CallGraphResourceInfo.find(Callee);

         // FIXME: Call site could have norecurse on it
         if (!Callee || !Callee->doesNotRecurse()) {
@@ -539,15 +479,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
             // directly call the tail called function. If a kernel directly
             // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
-            CalleeFrameSize = std::max(
-                CalleeFrameSize,
+            Info.CalleeSegmentSize = std::max(
+                Info.CalleeSegmentSize,
                 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
           }
         }

-        if (IsIndirect || I == CallGraphResourceInfo.end()) {
-          CalleeFrameSize =
-              std::max(CalleeFrameSize,
+        if (IsIndirect) {
+          Info.CalleeSegmentSize =
+              std::max(Info.CalleeSegmentSize,
                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));

           // Register usage of indirect calls gets handled later
           Info.UsesVCC = true;
           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
           Info.HasDynamicallySizedStack = true;
           Info.HasIndirectCall = true;
-        } else {
-          // We force CodeGen to run in SCC order, so the callee's register
-          // usage etc. should be the cumulative usage of all callees.
-          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
-          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
-          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
-          CalleeFrameSize =
-              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
-          Info.UsesVCC |= I->second.UsesVCC;
-          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
-          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
-          Info.HasRecursion |= I->second.HasRecursion;
-          Info.HasIndirectCall |= I->second.HasIndirectCall;
         }
       }
     }
@@ -576,36 +503,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   Info.NumExplicitSGPR = MaxSGPR + 1;
   Info.NumVGPR = MaxVGPR + 1;
   Info.NumAGPR = MaxAGPR + 1;
-  Info.PrivateSegmentSize += CalleeFrameSize;

   return Info;
 }
-
-void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
-  // Collect the maximum number of registers from non-hardware-entrypoints.
-  // All these functions are potential targets for indirect calls.
-  int32_t NonKernelMaxSGPRs = 0;
-  int32_t NonKernelMaxVGPRs = 0;
-  int32_t NonKernelMaxAGPRs = 0;
-
-  for (const auto &I : CallGraphResourceInfo) {
-    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
-      auto &Info = I.getSecond();
-      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
-      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
-      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
-    }
-  }
-
-  // Add register usage for functions with indirect calls.
-  // For calls to unknown functions, we assume the maximum register usage of
-  // all non-hardware-entrypoints in the current module.
-  for (auto &I : CallGraphResourceInfo) {
-    auto &Info = I.getSecond();
-    if (Info.HasIndirectCall) {
-      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
-      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
-      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
-    }
-  }
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 7f71de6749dcef..92ef41f49b3ba8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H

-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"

 namespace llvm {

@@ -24,10 +24,9 @@ class GCNSubtarget;
 class MachineFunction;
 class TargetMachine;

-struct AMDGPUResourceUsageAnalysis : public ModulePass {
-  static char ID;
-
+struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass {
 public:
+  static char ID;
   // Track resource usage for callee functions.
   struct SIFunctionResourceInfo {
     // Track the number of explicitly used VGPRs. Special registers reserved at
@@ -35,48 +34,33 @@ struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass {
     int32_t NumVGPR = 0;
     int32_t NumAGPR = 0;
     int32_t NumExplicitSGPR = 0;
+    uint64_t CalleeSegmentSize = 0;
     uint64_t PrivateSegmentSize = 0;
     bool UsesVCC = false;
     bool UsesFlatScratch = false;
     bool HasDynamicallySizedStack = false;
     bool HasRecursion = false;
     bool HasIndirectCall = false;
-
-    int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
-    // Total number of VGPRs is actually a combination of AGPR and VGPR
-    // depending on architecture - and some alignment constraints
-    int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+    SmallVector<const Function *, 16> Callees;
   };

-  AMDGPUResourceUsageAnalysis() : ModulePass(ID) {}
+  AMDGPUResourceUsageAnalysis() : MachineFunctionPass(ID) {}

-  bool doInitialization(Module &M) override {
-    CallGraphResourceInfo.clear();
-    return ModulePass::doInitialization(M);
-  }
+  bool runOnMachineFunction(MachineFunction &MF) override;

-  bool runOnModule(Module &M) override;
+  const SIFunctionResourceInfo &getResourceInfo() const { return ResourceInfo; }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineModuleInfoWrapperPass>();
     AU.setPreservesAll();
-  }
-
-  const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
-    auto Info = CallGraphResourceInfo.find(F);
-    assert(Info != CallGraphResourceInfo.end() &&
-           "Failed to find resource info for function");
-    return Info->getSecond();
+    MachineFunctionPass::getAnalysisUsage(AU);
   }

 private:
   SIFunctionResourceInfo
-  analyzeResourceUsage(const MachineFunction &MF, const TargetMachine &TM,
+  analyzeResourceUsage(const MachineFunction &MF,
                        uint32_t AssumedStackSizeForDynamicSizeObjects,
                        uint32_t AssumedStackSizeForExternalCall) const;

-  void propagateIndirectCallRegisterUsage();
-
-  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+  SIFunctionResourceInfo ResourceInfo;
 };
 } // namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7c883cc2017ddd..4605be344f7316 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp AMDGPUIGroupLP.cpp + AMDGPUMCResourceInfo.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 73d466abc66f7b..a1a41d6cc8c6a0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -271,6 +271,47 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, << Alignment.value() << '\n'; } +void AMDGPUTargetAsmStreamer::EmitMCResourceInfo( + const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, + const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, + const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, + const MCSymbol *HasIndirectCall) { +#define PRINT_RES_INFO(ARG) \ + OS << "\t.set "; \ + ARG->print(OS, getContext().getAsmInfo()); \ + OS << ", "; \ + ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \ + Streamer.addBlankLine(); + + PRINT_RES_INFO(NumVGPR); + PRINT_RES_INFO(NumAGPR); + PRINT_RES_INFO(NumExplicitSGPR); + PRINT_RES_INFO(PrivateSegmentSize); + PRINT_RES_INFO(UsesVCC); + PRINT_RES_INFO(UsesFlatScratch); + PRINT_RES_INFO(HasDynamicallySizedStack); + PRINT_RES_INFO(HasRecursion); + PRINT_RES_INFO(HasIndirectCall); +#undef PRINT_RES_INFO +} + +void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR, + const MCSymbol *MaxAGPR, + const MCSymbol *MaxSGPR) { +#define PRINT_RES_INFO(ARG) \ + OS << "\t.set "; \ + ARG->print(OS, getContext().getAsmInfo()); \ + OS << ", "; \ + ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \ + Streamer.addBlankLine(); + + PRINT_RES_INFO(MaxVGPR); + PRINT_RES_INFO(MaxAGPR); + PRINT_RES_INFO(MaxSGPR); +#undef PRINT_RES_INFO +} + bool AMDGPUTargetAsmStreamer::EmitISAVersion() { OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n"; return true; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index bf1538c71d1543..6a91ad06de5d12 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -60,6 +60,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) { } + virtual void EmitMCResourceInfo( + const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, + const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, + const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, + const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, + const MCSymbol *HasIndirectCall) {}; + + virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, + const MCSymbol *MaxAGPR, + const MCSymbol *MaxSGPR) {}; + /// \returns True on success, false on failure. 
 virtual bool EmitISAVersion() { return true; }
@@ -136,6 +147,18 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {

   void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;

+  void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+                          const MCSymbol *NumExplicitSGPR,
+                          const MCSymbol *PrivateSegmentSize,
+                          const MCSymbol *UsesVCC,
+                          const MCSymbol *UsesFlatScratch,
+                          const MCSymbol *HasDynamicallySizedStack,
+                          const MCSymbol *HasRecursion,
+                          const MCSymbol *HasIndirectCall) override;
+
+  void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
+                              const MCSymbol *MaxSGPR) override;
+
   /// \returns True on success, false on failure.
   bool EmitISAVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index a53bf70d77717b..92d09b3afa77d7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -215,15 +215,15 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
       const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
       Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
     }
-    ExprIt->getSecond() = Val;
   } else if (N.getKind() == msgpack::Type::UInt) {
     const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
     Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
-    int64_t Unused;
-    if (!Val->evaluateAsAbsolute(Unused))
-      REM[Reg] = Val;
-    (void)Unused;
+  } else {
+    // Default to uint64_t 0 so additional calls to setRegister will keep
+    // propagating the ORs.
+    N = (uint64_t)0;
+  }
+  REM[Reg] = Val;
   DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
 }

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 8d87b53efb4e73..0e16ea10c019ac 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -154,25 +154,28 @@ bb:
 declare void @undef_func()

 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 64
-; GFX90A: .amdhsa_accum_offset 32
-; GCN: NumVgprs: 32
-; GCN: NumAgprs: 32
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_call_undef_func.num_vgpr, max(32, amdgpu.max_num_vgpr)
+; GCN: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr)
+; GCN: NumVgprs: kernel_call_undef_func.num_vgpr
+; GCN: NumAgprs: kernel_call_undef_func.num_agpr
+; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr)
+; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1
+; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1
+; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: AccumOffset: 
((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4 +; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) +; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63 define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() ret void } +; GCN: .set amdgpu.max_num_vgpr, 32 +; GCN-NEXT: .set amdgpu.max_num_agpr, 32 +; GCN-NEXT: .set amdgpu.max_num_sgpr, 34 + attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index c8ba6722d9d85e..122fc42ef9b62a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -11,7 +11,7 @@ ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6: ; ASM: .amdhsa_user_sgpr_count 10 ; ASM: .amdhsa_next_free_sgpr 10 -; ASM: ; NumSgprs: 16 +; ASM: ; TotalNumSgprs: 16 ; ASM: ; NumSGPRsForWavesPerEU: 16 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT @@ -31,7 +31,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2: ; ASM: .amdhsa_user_sgpr_count 10 ; ASM: .amdhsa_next_free_sgpr 10 -; ASM: ; NumSgprs: 16 +; ASM: ; TotalNumSgprs: 16 ; ASM: ; NumSGPRsForWavesPerEU: 16 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2 @@ -47,7 +47,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2: ; ASM: .amdhsa_user_sgpr_count 3 ; ASM: .amdhsa_next_free_sgpr 3 -; ASM: ; NumSgprs: 9 +; ASM: ; TotalNumSgprs: 9 ; ASM: ; NumSGPRsForWavesPerEU: 9 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD. @@ -62,7 +62,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2: ; ASM: .amdhsa_user_sgpr_count 2 ; ASM: .amdhsa_next_free_sgpr 0 -; ASM: ; NumSgprs: 6 +; ASM: ; TotalNumSgprs: 6 ; ASM: ; NumSGPRsForWavesPerEU: 6 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD. 
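
As a hand-written sketch of the scheme these tests exercise (function names
and register counts are made up here, not actual compiler output), the
resource-usage symbols emitted for a kernel foo with a single known callee
bar would look like:

    .set bar.num_vgpr, 24
    .set bar.numbered_sgpr, 30
    .set bar.private_seg_size, 16
    .set bar.uses_vcc, 1
    .set foo.num_vgpr, max(32, bar.num_vgpr)
    .set foo.numbered_sgpr, max(33, bar.numbered_sgpr)
    .set foo.private_seg_size, 0+max(bar.private_seg_size)
    .set foo.uses_vcc, or(1, bar.uses_vcc)

A function containing an indirect call instead folds in the module-wide
maxima, e.g. ".set foo.num_vgpr, max(32, amdgpu.max_num_vgpr)", which is what
the tests below check; the kernel descriptor fields (e.g.
.amdhsa_next_free_vgpr) then reference these symbols rather than hard-coded
constants, so they resolve only once every callee's usage is known.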
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll index 99a7ae37e0e78d..8f4cb364751d88 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll @@ -60,7 +60,9 @@ bb: declare void @undef_func() ; CHECK: .type kernel_call_undef_func -; CHECK: NumAgprs: 32 +; CHECK: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr) +; CHECK: NumAgprs: kernel_call_undef_func.num_agpr +; CHECK: .set amdgpu.max_num_agpr, 32 define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll index e4d427a0b826f8..d45e116beb4e3e 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -547,18 +547,20 @@ define amdgpu_kernel void @f256() #256 { attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" } ; GCN-LABEL: {{^}}f512: -; GFX9: NumVgprs: 128 -; GFX90A: NumVgprs: 128 -; GFX90A: NumAgprs: 128 -; GFX90A: TotalNumVgprs: 256 -; GFX10WGP-WAVE32: NumVgprs: 256 -; GFX10WGP-WAVE64: NumVgprs: 256 -; GFX10CU-WAVE32: NumVgprs: 128 -; GFX10CU-WAVE64: NumVgprs: 128 -; GFX11WGP-WAVE32: NumVgprs: 256 -; GFX11WGP-WAVE64: NumVgprs: 256 -; GFX11CU-WAVE32: NumVgprs: 192 -; GFX11CU-WAVE64: NumVgprs: 192 +; GFX9: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX90A: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX90A: .set f512.num_agpr, max(128, amdgpu.max_num_agpr) +; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX10CU-WAVE64: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr) +; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GCN: NumVgprs: f512.num_vgpr +; GFX90A: NumAgprs: f512.num_agpr +; GFX90A: TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr) define amdgpu_kernel void @f512() #512 { call void @foo() call void @use256vgprs() @@ -567,17 +569,20 @@ define amdgpu_kernel void @f512() #512 { attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" } ; GCN-LABEL: {{^}}f1024: -; GFX9: NumVgprs: 64 -; GFX90A: NumAgprs: 64 -; GFX90A: TotalNumVgprs: 128 -; GFX10WGP-WAVE32: NumVgprs: 128 -; GFX10WGP-WAVE64: NumVgprs: 128 -; GFX10CU-WAVE32: NumVgprs: 64 -; GFX10CU-WAVE64: NumVgprs: 64 -; GFX11WGP-WAVE32: NumVgprs: 192 -; GFX11WGP-WAVE64: NumVgprs: 192 -; GFX11CU-WAVE32: NumVgprs: 96 -; GFX11CU-WAVE64: NumVgprs: 96 +; GFX9: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX90A: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX90A: .set f1024.num_agpr, max(64, amdgpu.max_num_agpr) +; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, amdgpu.max_num_vgpr) +; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr) +; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, 
amdgpu.max_num_vgpr) +; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, amdgpu.max_num_vgpr) +; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, amdgpu.max_num_vgpr) +; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, amdgpu.max_num_vgpr) +; GCN: NumVgprs: f1024.num_vgpr +; GFX90A: NumAgprs: f1024.num_agpr +; GFX90A: TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr) define amdgpu_kernel void @f1024() #1024 { call void @foo() call void @use256vgprs() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index a795e995603410..e8898d6a7001cc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=ALL %s ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s ; CallGraphAnalysis, which CodeGenSCC order depends on, does not look @@ -8,12 +8,13 @@ @alias = hidden alias void (), ptr @aliasee_default ; ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 32 -; GFX908-NEXT: .amdhsa_next_free_sgpr 33 +; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0) +; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.numbered_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)) +; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4 -; GFX90A: .amdhsa_next_free_vgpr 59 -; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A-NEXT: .amdhsa_accum_offset 32 +; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr) +; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr) +; ALL-NEXT: .set kernel.numbered_sgpr, max(33, aliasee_default.numbered_sgpr) define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 @@ -25,6 +26,9 @@ bb: call void asm sideeffect "; clobber a26 ", "~{a26}"() ret void } +; ALL: .set aliasee_default.num_vgpr, 0 +; ALL-NEXT: .set aliasee_default.num_agpr, 27 +; ALL-NEXT: .set aliasee_default.numbered_sgpr, 32 attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll index c976cc3d53b5eb..a01268625cedbd 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -7,14 +7,18 @@ @alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102 ; CHECK-LABEL: {{^}}kernel0: -; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr) +; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr) +; CHECK-NEXT: .set kernel0.numbered_sgpr, max(33, aliasee_default_vgpr64_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel0() #0 { bb: call void @alias0() #2 ret void } +; CHECK: .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53 +; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0 +; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.numbered_sgpr, 32 define internal void @aliasee_default_vgpr64_sgpr102() #1 { bb: call void asm 
sideeffect "; clobber v52 ", "~{v52}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index edef71ef143dfd..86defe3ba7ec08 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,8 +9,12 @@ ; The parent kernel has a higher VGPR usage than the possible callees. ; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 41 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0) +; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.numbered_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)) + +; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr) +; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr) +; CHECK-NEXT: .set kernel1.numbered_sgpr, max(33, aliasee_vgpr32_sgpr76.numbered_sgpr) define amdgpu_kernel void @kernel1() #0 { bb: call void asm sideeffect "; clobber v40 ", "~{v40}"() @@ -18,6 +22,9 @@ bb: ret void } +; CHECK: .set aliasee_vgpr32_sgpr76.num_vgpr, 27 +; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0 +; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.numbered_sgpr, 32 define internal void @aliasee_vgpr32_sgpr76() #1 { bb: call void asm sideeffect "; clobber v26 ", "~{v26}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll index bb34ef1a15d2b9..6b1fbd9b6e16a2 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -7,14 +7,21 @@ @alias2 = hidden alias void (), ptr @aliasee_vgpr64_sgpr102 ; CHECK-LABEL: {{^}}kernel2: -; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0) +; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.numbered_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)) + +; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr) +; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr) +; CHECK-NEXT: .set kernel2.numbered_sgpr, max(33, aliasee_vgpr64_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel2() #0 { bb: call void @alias2() #2 ret void } +; CHECK: .set aliasee_vgpr64_sgpr102.num_vgpr, 53 +; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0 +; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.numbered_sgpr, 32 define internal void @aliasee_vgpr64_sgpr102() #1 { bb: call void asm sideeffect "; clobber v52 ", "~{v52}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll index 8a88eb7e51ad72..c81181cd826677 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -7,14 +7,21 @@ @alias3 = hidden alias void (), ptr @aliasee_vgpr256_sgpr102 ; CHECK-LABEL: {{^}}kernel3: -; CHECK: .amdhsa_next_free_vgpr 253 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0) +; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.numbered_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 
0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)) + +; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr) +; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr) +; CHECK-NEXT: .set kernel3.numbered_sgpr, max(33, aliasee_vgpr256_sgpr102.numbered_sgpr) define amdgpu_kernel void @kernel3() #0 { bb: call void @alias3() #2 ret void } +; CHECK: .set aliasee_vgpr256_sgpr102.num_vgpr, 253 +; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0 +; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.numbered_sgpr, 33 define internal void @aliasee_vgpr256_sgpr102() #1 { bb: call void asm sideeffect "; clobber v252 ", "~{v252}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index 6af45035d394f8..dbd00f09943c01 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -7,7 +7,7 @@ ; Make sure to run a GPU with the SGPR allocation bug. ; GCN-LABEL: {{^}}use_vcc: -; GCN: ; NumSgprs: 34 +; GCN: ; TotalNumSgprs: 34 ; GCN: ; NumVgprs: 0 define void @use_vcc() #1 { call void asm sideeffect "", "~{vcc}" () #0 @@ -25,7 +25,7 @@ define void @use_vcc() #1 { ; GCN: v_readlane_b32 s4, v40, 2 ; GCN: s_mov_b32 s33, s4 ; GCN: s_setpc_b64 s[30:31] -; GCN: ; NumSgprs: 36 +; GCN: ; TotalNumSgprs: 36 ; GCN: ; NumVgprs: 41 define void @indirect_use_vcc() #1 { call void @use_vcc() @@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: -; CI: ; NumSgprs: 38 -; VI-NOBUG: ; NumSgprs: 40 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 38 +; VI-NOBUG: ; TotalNumSgprs: 40 +; VI-BUG: ; TotalNumSgprs: 96 ; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() @@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) } ; GCN-LABEL: {{^}}use_flat_scratch: -; CI: ; NumSgprs: 36 -; VI: ; NumSgprs: 38 +; CI: ; TotalNumSgprs: 36 +; VI: ; TotalNumSgprs: 38 ; GCN: ; NumVgprs: 0 define void @use_flat_scratch() #1 { call void asm sideeffect "", "~{flat_scratch}" () #0 @@ -52,8 +52,8 @@ define void @use_flat_scratch() #1 { } ; GCN-LABEL: {{^}}indirect_use_flat_scratch: -; CI: ; NumSgprs: 38 -; VI: ; NumSgprs: 40 +; CI: ; TotalNumSgprs: 38 +; VI: ; TotalNumSgprs: 40 ; GCN: ; NumVgprs: 41 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() @@ -61,9 +61,9 @@ define void @indirect_use_flat_scratch() #1 { } ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: -; CI: ; NumSgprs: 38 -; VI-NOBUG: ; NumSgprs: 40 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 38 +; VI-NOBUG: ; TotalNumSgprs: 40 +; VI-BUG: ; TotalNumSgprs: 96 ; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_flat_scratch() @@ -107,23 +107,23 @@ define void @indirect_use_50_vgpr() #0 { } ; GCN-LABEL: {{^}}use_80_sgpr: -; GCN: ; NumSgprs: 80 +; GCN: ; TotalNumSgprs: 80 define void @use_80_sgpr() #1 { call void asm sideeffect "", "~{s79}"() #0 ret void } ; GCN-LABEL: {{^}}indirect_use_80_sgpr: -; GCN: ; NumSgprs: 82 +; GCN: ; TotalNumSgprs: 82 define void @indirect_use_80_sgpr() #1 { call void @use_80_sgpr() ret void } ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: -; CI: ; NumSgprs: 84 -; VI-NOBUG: ; NumSgprs: 86 -; VI-BUG: ; NumSgprs: 96 +; CI: ; TotalNumSgprs: 84 +; VI-NOBUG: ; TotalNumSgprs: 
86 +; VI-BUG: ; TotalNumSgprs: 96 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { call void @indirect_use_80_sgpr() ret void @@ -176,7 +176,7 @@ define amdgpu_kernel void @multi_call_use_use_stack() #0 { declare void @external() #0 ; GCN-LABEL: {{^}}usage_external: -; NumSgprs: 48 +; TotalNumSgprs: 48 ; NumVgprs: 24 ; GCN: ScratchSize: 16384 ; @@ -190,7 +190,7 @@ define amdgpu_kernel void @usage_external() #0 { declare void @external_recurse() #2 ; GCN-LABEL: {{^}}usage_external_recurse: -; NumSgprs: 48 +; TotalNumSgprs: 48 ; NumVgprs: 24 ; GCN: ScratchSize: 16384 ; @@ -234,10 +234,11 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { ; Make sure there's no assert when a sgpr96 is used. ; GCN-LABEL: {{^}}count_use_sgpr96_external_call ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}] -; CI: NumSgprs: 84 -; VI-NOBUG: NumSgprs: 86 -; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 50 +; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4 +; VI-BUG: TotalNumSgprs: 96 +; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr define amdgpu_kernel void @count_use_sgpr96_external_call() { entry: tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1 @@ -248,10 +249,11 @@ entry: ; Make sure there's no assert when a sgpr160 is used. ; GCN-LABEL: {{^}}count_use_sgpr160_external_call ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}] -; CI: NumSgprs: 84 -; VI-NOBUG: NumSgprs: 86 -; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 50 +; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4 +; VI-BUG: TotalNumSgprs: 96 +; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr define amdgpu_kernel void @count_use_sgpr160_external_call() { entry: tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1 @@ -262,10 +264,11 @@ entry: ; Make sure there's no assert when a vgpr160 is used. 
; GCN-LABEL: {{^}}count_use_vgpr160_external_call ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}] -; CI: NumSgprs: 84 -; VI-NOBUG: NumSgprs: 86 -; VI-BUG: NumSgprs: 96 -; GCN: NumVgprs: 50 +; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, amdgpu.max_num_vgpr) +; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; CI: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+4 +; VI-BUG: TotalNumSgprs: 96 +; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr define amdgpu_kernel void @count_use_vgpr160_external_call() { entry: tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1 @@ -273,6 +276,27 @@ entry: ret void } +; GCN: .set amdgpu.max_num_vgpr, 50 +; GCN: .set amdgpu.max_num_agpr, 0 +; GCN: .set amdgpu.max_num_sgpr, 80 + +; GCN-LABEL: amdhsa.kernels: +; GCN: .name: count_use_sgpr96_external_call +; CI: .sgpr_count: 84 +; VI-NOBUG: .sgpr_count: 86 +; VI-BUG: .sgpr_count: 96 +; GCN: .vgpr_count: 50 +; GCN: .name: count_use_sgpr160_external_call +; CI: .sgpr_count: 84 +; VI-NOBUG: .sgpr_count: 86 +; VI-BUG: .sgpr_count: 96 +; GCN: .vgpr_count: 50 +; GCN: .name: count_use_vgpr160_external_call +; CI: .sgpr_count: 84 +; VI-NOBUG: .sgpr_count: 86 +; VI-BUG: .sgpr_count: 96 +; GCN: .vgpr_count: 50 + attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll index 643f2619840a22..ede57f1a0a04ce 100644 --- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -33,6 +33,7 @@ bb2: ; GCN-LABEL: {{^}}preserve_condition_undef_flag: ; GCN-NOT: vcc +; GCN: s_endpgm define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 9d93609b1e8813..f198833059572b 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,8 +1,8 @@ ; REQUIRES: asserts -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s -; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -; CHECK: function must have been generated already +; CHECK-NOT: func define internal i32 @func() { ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 
e4ffedd686ac93..02eb1ad9453291 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -25,11 +25,11 @@ ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 8 +; CI: ; TotalNumSgprs: 8 +; VI-NOXNACK: ; TotalNumSgprs: 8 +; VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -42,11 +42,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 10 +; CI: ; TotalNumSgprs: 10 +; VI-NOXNACK: ; TotalNumSgprs: 10 +; VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -59,11 +59,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 12 -; VI-NOXNACK: ; NumSgprs: 14 -; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 8 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -76,11 +76,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: ; NumSgprs: 12 -; VI-NOXNACK: ; NumSgprs: 14 -; VI-XNACK: ; NumSgprs: 14 -; GFX9-ARCH-FLAT: ; NumSgprs: 14 -; GFX10-ARCH-FLAT: ; NumSgprs: 10 +; CI: ; TotalNumSgprs: 12 +; VI-NOXNACK: ; TotalNumSgprs: 14 +; VI-XNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -96,11 +96,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 -; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; CI: TotalNumSgprs: 4 +; VI-NOXNACK: TotalNumSgprs: 6 +; VI-XNACK: TotalNumSgprs: 6 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -113,11 +113,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 -; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; CI: TotalNumSgprs: 4 +; VI-NOXNACK: TotalNumSgprs: 6 +; VI-XNACK: TotalNumSgprs: 6 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -130,11 +130,11 @@ entry: ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 -; CI: NumSgprs: 4 -; VI-NOXNACK: NumSgprs: 6 -; VI-XNACK: 
NumSgprs: 6 -; GFX9-ARCH-FLAT: ; NumSgprs: 6 -; GFX10-ARCH-FLAT: ; NumSgprs: 0 +; CI: TotalNumSgprs: 4 +; VI-NOXNACK: TotalNumSgprs: 6 +; VI-XNACK: TotalNumSgprs: 6 +; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll new file mode 100644 index 00000000000000..9e3264eb9c07f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -0,0 +1,531 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s + +; Functions that don't make calls should have constants as their resource usage, as no resource information has to be propagated. + +; GCN-LABEL: {{^}}use_vcc: +; GCN: .set use_vcc.num_vgpr, 0 +; GCN: .set use_vcc.num_agpr, 0 +; GCN: .set use_vcc.numbered_sgpr, 32 +; GCN: .set use_vcc.private_seg_size, 0 +; GCN: .set use_vcc.uses_vcc, 1 +; GCN: .set use_vcc.uses_flat_scratch, 0 +; GCN: .set use_vcc.has_dyn_sized_stack, 0 +; GCN: .set use_vcc.has_recursion, 0 +; GCN: .set use_vcc.has_indirect_call, 0 +; GCN: TotalNumSgprs: 36 +; GCN: NumVgprs: 0 +; GCN: ScratchSize: 0 +define void @use_vcc() #1 { + call void asm sideeffect "", "~{vcc}" () #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_vcc: +; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr) +; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr) +; GCN: .set indirect_use_vcc.numbered_sgpr, max(34, use_vcc.numbered_sgpr) +; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size)) +; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc) +; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch) +; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack) +; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion) +; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define void @indirect_use_vcc() #1 { + call void @use_vcc() + ret void +} + +; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: +; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr) +; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr) +; GCN: .set indirect_2level_use_vcc_kernel.numbered_sgpr, max(33, indirect_use_vcc.numbered_sgpr) +; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size)) +; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc) +; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch) +; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack) +; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion) +; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { + call void @indirect_use_vcc() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scratch: +; GCN: .set use_flat_scratch.num_vgpr, 0 +; GCN: .set use_flat_scratch.num_agpr, 0 +; GCN: .set 
use_flat_scratch.numbered_sgpr, 32 +; GCN: .set use_flat_scratch.private_seg_size, 0 +; GCN: .set use_flat_scratch.uses_vcc, 0 +; GCN: .set use_flat_scratch.uses_flat_scratch, 1 +; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0 +; GCN: .set use_flat_scratch.has_recursion, 0 +; GCN: .set use_flat_scratch.has_indirect_call, 0 +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 0 +; GCN: ScratchSize: 0 +define void @use_flat_scratch() #1 { + call void asm sideeffect "", "~{flat_scratch}" () #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_flat_scratch: +; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr) +; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr) +; GCN: .set indirect_use_flat_scratch.numbered_sgpr, max(34, use_flat_scratch.numbered_sgpr) +; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size)) +; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc) +; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch) +; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack) +; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion) +; GCN: .set indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define void @indirect_use_flat_scratch() #1 { + call void @use_flat_scratch() + ret void +} + +; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: +; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr) +; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr) +; GCN: .set indirect_2level_use_flat_scratch_kernel.numbered_sgpr, max(33, indirect_use_flat_scratch.numbered_sgpr) +; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size)) +; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc) +; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch) +; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack) +; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion) +; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { + call void @indirect_use_flat_scratch() + ret void +} + +; GCN-LABEL: {{^}}use_10_vgpr: +; GCN: .set use_10_vgpr.num_vgpr, 10 +; GCN: .set use_10_vgpr.num_agpr, 0 +; GCN: .set use_10_vgpr.numbered_sgpr, 32 +; GCN: .set use_10_vgpr.private_seg_size, 0 +; GCN: .set use_10_vgpr.uses_vcc, 0 +; GCN: .set use_10_vgpr.uses_flat_scratch, 0 +; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0 +; GCN: .set use_10_vgpr.has_recursion, 0 +; GCN: .set use_10_vgpr.has_indirect_call, 0 +; GCN: TotalNumSgprs: 36 +; GCN: NumVgprs: 10 +; GCN: ScratchSize: 0 +define void @use_10_vgpr() #1 { + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0 + call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0 + ret void 
+} + +; GCN-LABEL: {{^}}indirect_use_10_vgpr: +; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr) +; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr) +; GCN: .set indirect_use_10_vgpr.numbered_sgpr, max(34, use_10_vgpr.numbered_sgpr) +; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size)) +; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc) +; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch) +; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack) +; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion) +; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define void @indirect_use_10_vgpr() #0 { + call void @use_10_vgpr() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: +; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr) +; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr) +; GCN: .set indirect_2_level_use_10_vgpr.numbered_sgpr, max(33, indirect_use_10_vgpr.numbered_sgpr) +; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size)) +; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc) +; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch) +; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack) +; GCN: .set indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion) +; GCN: .set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 16 +define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { + call void @indirect_use_10_vgpr() + ret void +} + +; GCN-LABEL: {{^}}use_50_vgpr: +; GCN: .set use_50_vgpr.num_vgpr, 50 +; GCN: .set use_50_vgpr.num_agpr, 0 +; GCN: .set use_50_vgpr.numbered_sgpr, 32 +; GCN: .set use_50_vgpr.private_seg_size, 0 +; GCN: .set use_50_vgpr.uses_vcc, 0 +; GCN: .set use_50_vgpr.uses_flat_scratch, 0 +; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0 +; GCN: .set use_50_vgpr.has_recursion, 0 +; GCN: .set use_50_vgpr.has_indirect_call, 0 +; GCN: TotalNumSgprs: 36 +; GCN: NumVgprs: 50 +; GCN: ScratchSize: 0 +define void @use_50_vgpr() #1 { + call void asm sideeffect "", "~{v49}"() #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_50_vgpr: +; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr) +; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr) +; GCN: .set indirect_use_50_vgpr.numbered_sgpr, max(34, use_50_vgpr.numbered_sgpr) +; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size)) +; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc) +; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch) +; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack) +; GCN: .set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion) +; GCN: .set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 50 +; GCN: ScratchSize: 16 +define void 
@indirect_use_50_vgpr() #0 { + call void @use_50_vgpr() + ret void +} + +; GCN-LABEL: {{^}}use_80_sgpr: +; GCN: .set use_80_sgpr.num_vgpr, 1 +; GCN: .set use_80_sgpr.num_agpr, 0 +; GCN: .set use_80_sgpr.numbered_sgpr, 80 +; GCN: .set use_80_sgpr.private_seg_size, 8 +; GCN: .set use_80_sgpr.uses_vcc, 0 +; GCN: .set use_80_sgpr.uses_flat_scratch, 0 +; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0 +; GCN: .set use_80_sgpr.has_recursion, 0 +; GCN: .set use_80_sgpr.has_indirect_call, 0 +; GCN: TotalNumSgprs: 84 +; GCN: NumVgprs: 1 +; GCN: ScratchSize: 8 +define void @use_80_sgpr() #1 { + call void asm sideeffect "", "~{s79}"() #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_80_sgpr: +; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr) +; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr) +; GCN: .set indirect_use_80_sgpr.numbered_sgpr, max(34, use_80_sgpr.numbered_sgpr) +; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size)) +; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc) +; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch) +; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack) +; GCN: .set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion) +; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call) +; GCN: TotalNumSgprs: 84 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 24 +define void @indirect_use_80_sgpr() #1 { + call void @use_80_sgpr() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr: +; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr) +; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr) +; GCN: .set indirect_2_level_use_80_sgpr.numbered_sgpr, max(33, indirect_use_80_sgpr.numbered_sgpr) +; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size)) +; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc) +; GCN: .set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch) +; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack) +; GCN: .set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion) +; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call) +; GCN: TotalNumSgprs: 86 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 24 +define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 { + call void @indirect_use_80_sgpr() + ret void +} + +; GCN-LABEL: {{^}}use_stack0: +; GCN: .set use_stack0.num_vgpr, 1 +; GCN: .set use_stack0.num_agpr, 0 +; GCN: .set use_stack0.numbered_sgpr, 33 +; GCN: .set use_stack0.private_seg_size, 2052 +; GCN: .set use_stack0.uses_vcc, 0 +; GCN: .set use_stack0.uses_flat_scratch, 0 +; GCN: .set use_stack0.has_dyn_sized_stack, 0 +; GCN: .set use_stack0.has_recursion, 0 +; GCN: .set use_stack0.has_indirect_call, 0 +; GCN: TotalNumSgprs: 37 +; GCN: NumVgprs: 1 +; GCN: ScratchSize: 2052 +define void @use_stack0() #1 { + %alloca = alloca [512 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + ret void +} + +; GCN-LABEL: {{^}}use_stack1: +; GCN: .set use_stack1.num_vgpr, 1 +; GCN: .set use_stack1.num_agpr, 0 +; GCN: .set use_stack1.numbered_sgpr, 33 +; GCN: .set 
use_stack1.private_seg_size, 404 +; GCN: .set use_stack1.uses_vcc, 0 +; GCN: .set use_stack1.uses_flat_scratch, 0 +; GCN: .set use_stack1.has_dyn_sized_stack, 0 +; GCN: .set use_stack1.has_recursion, 0 +; GCN: .set use_stack1.has_indirect_call, 0 +; GCN: TotalNumSgprs: 37 +; GCN: NumVgprs: 1 +; GCN: ScratchSize: 404 +define void @use_stack1() #1 { + %alloca = alloca [100 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + ret void +} + +; GCN-LABEL: {{^}}indirect_use_stack: +; GCN: .set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr) +; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr) +; GCN: .set indirect_use_stack.numbered_sgpr, max(34, use_stack0.numbered_sgpr) +; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size)) +; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc) +; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch) +; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack) +; GCN: .set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion) +; GCN: .set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call) +; GCN: TotalNumSgprs: 38 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2132 +define void @indirect_use_stack() #1 { + %alloca = alloca [16 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + call void @use_stack0() + ret void +} + +; GCN-LABEL: {{^}}indirect_2_level_use_stack: +; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr) +; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr) +; GCN: .set indirect_2_level_use_stack.numbered_sgpr, max(33, indirect_use_stack.numbered_sgpr) +; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size)) +; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc) +; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch) +; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack) +; GCN: .set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion) +; GCN: .set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call) +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2132 +define amdgpu_kernel void @indirect_2_level_use_stack() #0 { + call void @indirect_use_stack() + ret void +} + + +; Should be maximum of callee usage +; GCN-LABEL: {{^}}multi_call_use_use_stack: +; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr) +; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr) +; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(42, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr) +; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size)) +; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc) +; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch) +; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack) +; GCN: .set multi_call_use_use_stack.has_recursion, or(0, 
use_stack0.has_recursion, use_stack1.has_recursion) +; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call) +; GCN: TotalNumSgprs: 48 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2052 +define amdgpu_kernel void @multi_call_use_use_stack() #0 { + call void @use_stack0() + call void @use_stack1() + ret void +} + +declare void @external() #0 + +; GCN-LABEL: {{^}}multi_call_with_external: +; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr) +; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set multi_call_with_external.numbered_sgpr, max(42, amdgpu.max_num_sgpr) +; GCN: .set multi_call_with_external.private_seg_size, 0 +; GCN: .set multi_call_with_external.uses_vcc, 1 +; GCN: .set multi_call_with_external.uses_flat_scratch, 1 +; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1 +; GCN: .set multi_call_with_external.has_recursion, 0 +; GCN: .set multi_call_with_external.has_indirect_call, 1 +; GCN: TotalNumSgprs: multi_call_with_external.numbered_sgpr+6 +; GCN: NumVgprs: multi_call_with_external.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @multi_call_with_external() #0 { + call void @use_stack0() + call void @use_stack1() + call void @external() + ret void +} + +; GCN-LABEL: {{^}}usage_external: +; GCN: .set usage_external.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set usage_external.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set usage_external.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set usage_external.private_seg_size, 0 +; GCN: .set usage_external.uses_vcc, 1 +; GCN: .set usage_external.uses_flat_scratch, 1 +; GCN: .set usage_external.has_dyn_sized_stack, 1 +; GCN: .set usage_external.has_recursion, 0 +; GCN: .set usage_external.has_indirect_call, 1 +; GCN: TotalNumSgprs: usage_external.numbered_sgpr+6 +; GCN: NumVgprs: usage_external.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @usage_external() #0 { + call void @external() + ret void +} + +declare void @external_recurse() #2 + +; GCN-LABEL: {{^}}usage_external_recurse: +; GCN: .set usage_external_recurse.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set usage_external_recurse.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set usage_external_recurse.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set usage_external_recurse.private_seg_size, 0 +; GCN: .set usage_external_recurse.uses_vcc, 1 +; GCN: .set usage_external_recurse.uses_flat_scratch, 1 +; GCN: .set usage_external_recurse.has_dyn_sized_stack, 1 +; GCN: .set usage_external_recurse.has_recursion, 1 +; GCN: .set usage_external_recurse.has_indirect_call, 1 +; GCN: TotalNumSgprs: usage_external_recurse.numbered_sgpr+6 +; GCN: NumVgprs: usage_external_recurse.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @usage_external_recurse() #0 { + call void @external_recurse() + ret void +} + +; GCN-LABEL: {{^}}direct_recursion_use_stack: +; GCN: .set direct_recursion_use_stack.num_vgpr, 41 +; GCN: .set direct_recursion_use_stack.num_agpr, 0 +; GCN: .set direct_recursion_use_stack.numbered_sgpr, 36 +; GCN: .set direct_recursion_use_stack.private_seg_size, 2064 +; GCN: .set direct_recursion_use_stack.uses_vcc, 1 +; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0 +; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0 +; GCN: .set direct_recursion_use_stack.has_recursion, 1 +; GCN: .set direct_recursion_use_stack.has_indirect_call, 0 +; GCN: TotalNumSgprs: 40 +; GCN: NumVgprs: 41 +; GCN: 
ScratchSize: 2064 +define void @direct_recursion_use_stack(i32 %val) #2 { + %alloca = alloca [512 x i32], align 4, addrspace(5) + call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %ret, label %call + +call: + %val.sub1 = sub i32 %val, 1 + call void @direct_recursion_use_stack(i32 %val.sub1) + br label %ret + +ret: + ret void +} + +; GCN-LABEL: {{^}}usage_direct_recursion: +; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr) +; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr) +; GCN: .set usage_direct_recursion.numbered_sgpr, max(33, direct_recursion_use_stack.numbered_sgpr) +; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size)) +; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc) +; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch) +; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack) +; GCN: .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion) +; GCN: .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call) +; GCN: TotalNumSgprs: 42 +; GCN: NumVgprs: 41 +; GCN: ScratchSize: 2064 +define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 { + call void @direct_recursion_use_stack(i32 %n) + ret void +} + +; Make sure there's no assert when a sgpr96 is used. +; GCN-LABEL: {{^}}count_use_sgpr96_external_call +; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0 +; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1 +; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1 +; GCN: .set count_use_sgpr96_external_call.has_dyn_sized_stack, 1 +; GCN: .set count_use_sgpr96_external_call.has_recursion, 0 +; GCN: .set count_use_sgpr96_external_call.has_indirect_call, 1 +; GCN: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+6 +; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @count_use_sgpr96_external_call() { +entry: + tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1 + call void @external() + ret void +} + +; Make sure there's no assert when a sgpr160 is used. 
+; GCN-LABEL: {{^}}count_use_sgpr160_external_call +; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0 +; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1 +; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1 +; GCN: .set count_use_sgpr160_external_call.has_dyn_sized_stack, 1 +; GCN: .set count_use_sgpr160_external_call.has_recursion, 0 +; GCN: .set count_use_sgpr160_external_call.has_indirect_call, 1 +; GCN: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+6 +; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @count_use_sgpr160_external_call() { +entry: + tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1 + call void @external() + ret void +} + +; Make sure there's no assert when a vgpr160 is used. +; GCN-LABEL: {{^}}count_use_vgpr160_external_call +; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr) +; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, amdgpu.max_num_agpr) +; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr) +; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0 +; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1 +; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1 +; GCN: .set count_use_vgpr160_external_call.has_dyn_sized_stack, 1 +; GCN: .set count_use_vgpr160_external_call.has_recursion, 0 +; GCN: .set count_use_vgpr160_external_call.has_indirect_call, 1 +; GCN: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+6 +; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr +; GCN: ScratchSize: 0 +define amdgpu_kernel void @count_use_vgpr160_external_call() { +entry: + tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1 + call void @external() + ret void +} + +; Added at the end of the .s are the module-level maximums +; GCN: .set amdgpu.max_num_vgpr, 50 +; GCN: .set amdgpu.max_num_agpr, 0 +; GCN: .set amdgpu.max_num_sgpr, 80 + +attributes #0 = { nounwind noinline norecurse } +attributes #1 = { nounwind noinline norecurse } +attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll index f20d720c3876ba..dce4162c246247 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll @@ -41,7 +41,7 @@ entry: } ; FIXME: This should warn too -; ERR-NOT: warning +; ERR-NOT: warning: inline asm clobber list contains reserved registers define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) { entry: %exec = call i64 asm sideeffect "; def $0", "={exec}"() diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 9d1368b2ec105a..e7c77d3123e825 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -3,6 +3,18 @@ declare i32 @llvm.amdgcn.workitem.id.x() +define <2 x i64> @f1() #0 { +; GFX11-LABEL: f1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] + ret <2 x i64> zeroinitializer +} + define void @f0() { ; GFX11-LABEL: f0: ; GFX11: ; %bb.0: ; %bb @@ -36,18 +48,6 @@ bb: ret void } -define <2 x i64> @f1() #0 { -; GFX11-LABEL: f1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - ret <2 x i64> zeroinitializer -} - ; FIXME: This generates "instid1(/* invalid instid value */)". define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) { ; GFX11-LABEL: f2: diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index b49931379b84a5..957f404c8cdbed 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -30,7 +30,7 @@ define hidden void @func() #1 { ; GCN-NOT: writelane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 37 +; GCN: ; TotalNumSgprs: 37 ; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, ptr addrspace(1) undef @@ -48,7 +48,7 @@ define amdgpu_kernel void @kernel_call() #0 { ; GCN-NOT: readlane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 34 +; GCN: ; TotalNumSgprs: 34 ; GCN: ; NumVgprs: 10 define void @func_regular_call() #1 { %vgpr = load volatile i32, ptr addrspace(1) undef @@ -64,7 +64,7 @@ define void @func_regular_call() #1 { ; GCN-NEXT: s_addc_u32 s17, ; GCN-NEXT: s_setpc_b64 s[16:17] -; GCN: ; NumSgprs: 32 +; GCN: ; TotalNumSgprs: 32 ; GCN: ; NumVgprs: 8 define void @func_tail_call() #1 { tail call void @func() @@ -77,7 +77,7 @@ define void @func_tail_call() #1 { ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: s_setpc_b64 -; GCN: ; NumSgprs: 34 +; GCN: ; TotalNumSgprs: 34 ; GCN: ; NumVgprs: 10 define void @func_call_tail_call() #1 { %vgpr = load volatile i32, ptr addrspace(1) undef @@ -105,13 +105,6 @@ define void @test_funcx2() #0 { ret void } -; GCN-LABEL: {{^}}wombat: -define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) { -bb: - call void @hoge() #0 - ret void -} - ; Make sure we save/restore the return address around the call. 
; Function Attrs: norecurse define internal void @hoge() #2 { @@ -128,6 +121,13 @@ bb: ret void } +; GCN-LABEL: {{^}}wombat: +define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) { +bb: + call void @hoge() #0 + ret void +} + declare dso_local void @eggs() diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 7bf1b8746fd87b..c9e24b721c41e1 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -149,12 +149,9 @@ ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter ; GCN-O0-NEXT: Stack Frame Layout Analysis -; GCN-O0-NEXT: Function register usage analysis -; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O0-NEXT: Machine Optimization Remark Emitter -; GCN-O0-NEXT: AMDGPU Assembly Printer -; GCN-O0-NEXT: Free MachineFunction +; GCN-O0-NEXT: Function register usage analysis +; GCN-O0-NEXT: AMDGPU Assembly Printer +; GCN-O0-NEXT: Free MachineFunction ; GCN-O1:Target Library Information ; GCN-O1-NEXT:Target Pass Configuration @@ -427,12 +424,9 @@ ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Stack Frame Layout Analysis -; GCN-O1-NEXT: Function register usage analysis -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O1-NEXT: Machine Optimization Remark Emitter -; GCN-O1-NEXT: AMDGPU Assembly Printer -; GCN-O1-NEXT: Free MachineFunction +; GCN-O1-NEXT: Function register usage analysis +; GCN-O1-NEXT: AMDGPU Assembly Printer +; GCN-O1-NEXT: Free MachineFunction ; GCN-O1-OPTS:Target Library Information ; GCN-O1-OPTS-NEXT:Target Pass Configuration @@ -733,12 +727,9 @@ ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis -; GCN-O1-OPTS-NEXT: Function register usage analysis -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter -; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer -; GCN-O1-OPTS-NEXT: Free MachineFunction +; GCN-O1-OPTS-NEXT: Function register usage analysis +; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer +; GCN-O1-OPTS-NEXT: Free MachineFunction ; GCN-O2:Target Library Information ; GCN-O2-NEXT:Target Pass Configuration @@ -1045,12 +1036,9 @@ ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Stack Frame Layout Analysis -; GCN-O2-NEXT: Function register usage analysis -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O2-NEXT: Machine Optimization Remark Emitter -; GCN-O2-NEXT: AMDGPU Assembly Printer -; GCN-O2-NEXT: Free MachineFunction +; GCN-O2-NEXT: Function register usage analysis +; GCN-O2-NEXT: AMDGPU Assembly Printer +; GCN-O2-NEXT: Free MachineFunction ; GCN-O3:Target Library Information ; GCN-O3-NEXT:Target Pass Configuration @@ -1369,12 +1357,9 @@ ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Stack Frame Layout Analysis -; GCN-O3-NEXT: Function register usage analysis -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis -; GCN-O3-NEXT: Machine Optimization Remark Emitter -; GCN-O3-NEXT: AMDGPU Assembly Printer -; GCN-O3-NEXT: Free 
MachineFunction +; GCN-O3-NEXT: Function register usage analysis +; GCN-O3-NEXT: AMDGPU Assembly Printer +; GCN-O3-NEXT: Free MachineFunction define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll index 34dcdaf29677e4..b508ffff8050a8 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll @@ -9,6 +9,19 @@ @lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16 +; GCN-LABEL: {{^}}f0: +; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3 +; GCN: ds_write_b8 [[NULL]], [[TREE]] +define void @f0() { +; OPT-LABEL: @f0() { +; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 +; OPT-NEXT: ret void +; + store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1 + ret void +} + ; GCN-LABEL: {{^}}k0: ; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 @@ -29,16 +42,3 @@ define amdgpu_kernel void @k0() { call void @f0() ret void } - -; GCN-LABEL: {{^}}f0: -; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3 -; GCN: ds_write_b8 [[NULL]], [[TREE]] -define void @f0() { -; OPT-LABEL: @f0() { -; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1 -; OPT-NEXT: ret void -; - store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll index 27b71dd471a839..aa16937d7d897d 100644 --- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll +++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll @@ -24,6 +24,55 @@ store i32 0, ptr addrspace(3) @used_by_kernel } ; CHECK: ; LDSByteSize: 4 bytes +define void @nonkernel() { +; GFX9-LABEL: nonkernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 +; GFX9-NEXT: ds_write_b64 v0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: nonkernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 +; GFX10-NEXT: ds_write_b64 v0, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; G_GFX9-LABEL: nonkernel: +; G_GFX9: ; %bb.0: +; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; G_GFX9-NEXT: v_mov_b32_e32 v2, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v3, 8 +; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, 0 +; G_GFX9-NEXT: ds_write_b32 v3, v2 +; G_GFX9-NEXT: ds_write_b64 v2, v[0:1] +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: s_setpc_b64 s[30:31] +; +; G_GFX10-LABEL: nonkernel: +; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; G_GFX10-NEXT: v_mov_b32_e32 v2, 0 +; G_GFX10-NEXT: v_mov_b32_e32 v3, 8 +; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 +; G_GFX10-NEXT: ds_write_b32 v3, v2 +; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: s_setpc_b64 s[30:31] + store i32 0, ptr addrspace(3) @used_by_both + store double 0.0, ptr addrspace(3) 
@used_by_function + ret void +} + ; Needs to allocate both variables, store to used_by_both is at sizeof(double) define amdgpu_kernel void @withcall() { ; GFX9-LABEL: withcall: @@ -171,55 +220,5 @@ define amdgpu_kernel void @nocall_false_sharing() { } ; CHECK: ; LDSByteSize: 4 bytes - -define void @nonkernel() { -; GFX9-LABEL: nonkernel: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: ds_write_b32 v0, v0 offset:8 -; GFX9-NEXT: ds_write_b64 v0, v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: nonkernel: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: ds_write_b32 v0, v0 offset:8 -; GFX10-NEXT: ds_write_b64 v0, v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; G_GFX9-LABEL: nonkernel: -; G_GFX9: ; %bb.0: -; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; G_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v3, 8 -; G_GFX9-NEXT: v_mov_b32_e32 v0, 0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, 0 -; G_GFX9-NEXT: ds_write_b32 v3, v2 -; G_GFX9-NEXT: ds_write_b64 v2, v[0:1] -; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX9-NEXT: s_setpc_b64 s[30:31] -; -; G_GFX10-LABEL: nonkernel: -; G_GFX10: ; %bb.0: -; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; G_GFX10-NEXT: v_mov_b32_e32 v3, 8 -; G_GFX10-NEXT: v_mov_b32_e32 v0, 0 -; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 -; G_GFX10-NEXT: ds_write_b32 v3, v2 -; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] -; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: s_setpc_b64 s[30:31] - store i32 0, ptr addrspace(3) @used_by_both - store double 0.0, ptr addrspace(3) @used_by_function - ret void -} - !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 6d18f354e65422..a2baa56ea0c989 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s ; CHECK-LABEL: {{^}}_amdgpu_cs_main: -; CHECK: ; NumSgprs: 4 +; CHECK: ; TotalNumSgprs: 4 ; CHECK: ; NumVgprs: 2 ; CHECK: .amdgpu_pal_metadata ; CHECK-NEXT: --- diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll index d58477c194ea62..c0d228e1254e64 100644 --- a/llvm/test/CodeGen/AMDGPU/recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/recursion.ll @@ -3,7 +3,11 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s ; CHECK-LABEL: {{^}}recursive: +; CHECK: .set recursive.private_seg_size, 16+(max(16384)) ; CHECK: ScratchSize: 16 + +; V5-LABEL: {{^}}recursive: +; V5: .set recursive.has_recursion, 1 define void @recursive() { call void @recursive() store volatile i32 0, ptr addrspace(1) undef @@ -11,18 +15,22 @@ define void @recursive() { } ; CHECK-LABEL: {{^}}tail_recursive: +; CHECK: .set tail_recursive.private_seg_size, 0 ; CHECK: ScratchSize: 0 define void @tail_recursive() { tail call void @tail_recursive() ret void } +; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size)) define void @calls_tail_recursive() norecurse { tail call void @tail_recursive() ret void 
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index d58477c194ea62..c0d228e1254e64 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -3,7 +3,11 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
 ; CHECK-LABEL: {{^}}recursive:
+; CHECK: .set recursive.private_seg_size, 16+(max(16384))
 ; CHECK: ScratchSize: 16
+
+; V5-LABEL: {{^}}recursive:
+; V5: .set recursive.has_recursion, 1
 define void @recursive() {
   call void @recursive()
   store volatile i32 0, ptr addrspace(1) undef
@@ -11,18 +15,22 @@ define void @recursive() {
 }
 ; CHECK-LABEL: {{^}}tail_recursive:
+; CHECK: .set tail_recursive.private_seg_size, 0
 ; CHECK: ScratchSize: 0
 define void @tail_recursive() {
   tail call void @tail_recursive()
   ret void
 }
+; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
 define void @calls_tail_recursive() norecurse {
   tail call void @tail_recursive()
   ret void
 }
 ; CHECK-LABEL: {{^}}tail_recursive_with_stack:
+; CHECK: .set tail_recursive_with_stack.private_seg_size, 8
+; CHECK: .set tail_recursive_with_stack.has_recursion, 1
 define void @tail_recursive_with_stack() {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
@@ -33,11 +41,11 @@ define void @tail_recursive_with_stack() {
 ; For an arbitrary recursive call, report a large number for unknown stack
 ; usage for code object v4 and older
 ; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
 ;
 ; V5-LABEL: {{^}}calls_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
+; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
 define amdgpu_kernel void @calls_recursive() {
   call void @recursive()
   ret void
@@ -46,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() {
 ; Make sure we do not report a huge stack size for tail recursive
 ; functions
 ; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
+; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size))
 define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
   call void @calls_tail_recursive()
   ret void
@@ -57,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
 ; in the kernel.
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
+; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
   call void @tail_recursive()
   ret void
 }
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; V5: .amdhsa_private_segment_fixed_size 8{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
+; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
   call void @tail_recursive_with_stack()
   ret void
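To see how these symbolic stack sizes fold back to the old hard-coded values,
a worked evaluation (a sketch; 16384 is the placeholder the backend reports
for the unknown stack usage of an arbitrary recursive call, per the comment
in the test above):

    .set recursive.private_seg_size, 16+(max(16384))
    ; = 16 + 16384 = 16400, the value the old
    ; .amdhsa_private_segment_fixed_size check hard-coded for v4
    .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
    ; = 0 + max(16384, 16400) = 16400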
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index 8d12b3fe626da8..35e11ad6a648ba 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; SI-LABEL: {{^}}foo:
 ; SI: .section .AMDGPU.csdata
 ; SI: ; Kernel info:
-; SI: ; NumSgprs: {{[0-9]+}}
+; SI: ; TotalNumSgprs: {{[0-9]+}}
 ; SI: ; NumVgprs: {{[0-9]+}}
 define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %abase, ptr addrspace(1) %bbase) nounwind {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 002de8bb4eb510..8bbae59f468f1d 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,7 +2,7 @@
 ; RUN: FileCheck -check-prefix=REMARK %s < %t
 ; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
+; STDERR-NEXT: remark: foo.cl:27:0: TotalSGPRs: 28
 ; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
 ; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
 ; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
@@ -27,7 +27,7 @@
 ; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
 ; REMARK-NEXT: Function: test_kernel
 ; REMARK-NEXT: Args:
-; REMARK-NEXT: - String: ' SGPRs: '
+; REMARK-NEXT: - String: ' TotalSGPRs: '
 ; REMARK-NEXT: - NumSGPR: '28'
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
@@ -122,7 +122,7 @@ define void @test_func() !dbg !6 {
 }
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4
 ; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
@@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 {
 }
 ; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
-; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: 10
+; STDERR-NEXT: remark: foo.cl:64:0: TotalSGPRs: test_indirect_call.numbered_sgpr+6
+; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr
+; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
 ; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
 }
 ; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
-; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10
+; STDERR-NEXT: remark: foo.cl:74:0: TotalSGPRs: test_indirect_w_static_stack.numbered_sgpr+6
+; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: test_indirect_w_static_stack.num_vgpr
+; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
 ; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
 ; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
index bba59ba4d80302..5d5aad76afd095 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
@@ -1,6 +1,6 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
 ; Make sure there's no assertion when trying to report the resource
 ; usage for a function which becomes dead during codegen.
@@ -21,9 +21,10 @@ define internal fastcc void @unreachable() {
 ; GCN-NOT: s_swappc_b64
 ; GCN: s_endpgm
-; GCN: .amdhsa_private_segment_fixed_size 0
-; GCN-NOT: .amdhsa_uses_dynamic_stack 0
-; GCN-V5: .amdhsa_uses_dynamic_stack 0
+; GCN-NOT: .amdhsa_uses_dynamic_stack
+; GCN-V5: .amdhsa_uses_dynamic_stack
+; ALL: .set entry.private_seg_size, 0
+; ALL: .set entry.has_dyn_sized_stack, 0
 define amdgpu_kernel void @entry() {
 bb0:
   br i1 false, label %bb1, label %bb2