-
Notifications
You must be signed in to change notification settings - Fork 11.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AtomicExpand: Copy metadata from atomicrmw to cmpxchg #109409
base: users/arsenm/amdgpu-add-baseline-tests-cmpxchg-expansion
Are you sure you want to change the base?
AtomicExpand: Copy metadata from atomicrmw to cmpxchg #109409
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: When expanding an atomicrmw with a cmpxchg, preserve any metadata attached to it. This will avoid unwanted double expansions in a future commit. The initial load should also probably receive the same metadata (which for some reason is not emitted as an atomic). Patch is 863.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109409.diff 20 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
index 1cb410a0c31c69..feb05de20b4571 100644
--- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -20,10 +20,11 @@ class Value;
/// Parameters (see the expansion example below):
/// (the builder, %addr, %loaded, %new_val, ordering,
-/// /* OUT */ %success, /* OUT */ %new_loaded)
-using CreateCmpXchgInstFun =
- function_ref<void(IRBuilderBase &, Value *, Value *, Value *, Align,
- AtomicOrdering, SyncScope::ID, Value *&, Value *&)>;
+/// /* OUT */ %success, /* OUT */ %new_loaded,
+/// %MetadataSrc)
+using CreateCmpXchgInstFun = function_ref<void(
+ IRBuilderBase &, Value *, Value *, Value *, Align, AtomicOrdering,
+ SyncScope::ID, Value *&, Value *&, Instruction *)>;
/// Expand an atomic RMW instruction into a loop utilizing
/// cmpxchg. You'll want to make sure your target machine likes cmpxchg
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3d4e2cb196a16a..5a3e529e5ebd02 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -98,7 +98,7 @@ class AtomicExpandImpl {
IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
- CreateCmpXchgInstFun CreateCmpXchg);
+ CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
@@ -600,7 +600,8 @@ void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Value *Loaded, Value *NewVal, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
- Value *&Success, Value *&NewLoaded) {
+ Value *&Success, Value *&NewLoaded,
+ Instruction *MetadataSrc) {
Type *OrigTy = NewVal->getType();
// This code can go away when cmpxchg supports FP and vector types.
@@ -612,9 +613,12 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Loaded = Builder.CreateBitCast(Loaded, IntTy);
}
- Value *Pair = Builder.CreateAtomicCmpXchg(
+ AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ if (MetadataSrc)
+ Pair->copyMetadata(*MetadataSrc);
+
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
@@ -951,9 +955,9 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
Value *OldResult;
if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
- OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
- PMV.AlignedAddrAlignment, MemOpOrder, SSID,
- PerformPartwordOp, createCmpXchgInstFun);
+ OldResult = insertRMWCmpXchgLoop(
+ Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment,
+ MemOpOrder, SSID, PerformPartwordOp, createCmpXchgInstFun, AI);
} else {
assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
@@ -1591,7 +1595,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
- CreateCmpXchgInstFun CreateCmpXchg) {
+ CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
LLVMContext &Ctx = Builder.getContext();
BasicBlock *BB = Builder.GetInsertBlock();
Function *F = BB->getParent();
@@ -1637,7 +1641,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
MemOpOrder == AtomicOrdering::Unordered
? AtomicOrdering::Monotonic
: MemOpOrder,
- SSID, Success, NewLoaded);
+ SSID, Success, NewLoaded, MetadataSrc);
assert(Success && NewLoaded);
Loaded->addIncoming(NewLoaded, LoopBB);
@@ -1686,7 +1690,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
AI->getValOperand());
},
- CreateCmpXchg);
+ CreateCmpXchg, /*MetadataSrc=*/AI);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -1838,11 +1842,15 @@ void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
expandAtomicRMWToCmpXchg(
I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
- SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
+ SyncScope::ID SSID, Value *&Success, Value *&NewLoaded,
+ Instruction *MetadataSrc) {
// Create the CAS instruction normally...
AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, Alignment, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ if (MetadataSrc)
+ Pair->copyMetadata(*MetadataSrc);
+
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index d3fb9d8ee522e7..443c5d18e68949 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -187,7 +187,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -204,7 +204,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -221,7 +221,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -238,7 +238,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -260,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -292,7 +292,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -309,7 +309,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -326,7 +326,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -343,7 +343,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -382,7 +382,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -409,7 +409,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -426,7 +426,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -443,7 +443,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -460,7 +460,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -482,7 +482,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -514,7 +514,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -531,7 +531,7 @@ define float @tes...
[truncated]
|
edf2464
to
f85fb78
Compare
4596ffa
to
500dc8a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure we can assume all metadata which applies to an atomicrmw also makes sense on the generated cmpxchg. I mean, most metadata probably does, but say we start allowing `!align` metadata on atomicrmw...
In terms of the generated code, I think we're fine; we don't rely on the load producing a value that's consistent with atomic ordering, I think. That said, it probably should be atomic, because strictly speaking a race is UB if we use a non-atomic load. |
f85fb78
to
caecd58
Compare
500dc8a
to
512e83a
Compare
caecd58
to
4934c7d
Compare
512e83a
to
d9f79c2
Compare
4934c7d
to
b69ead1
Compare
d9f79c2
to
6418d44
Compare
When expanding an atomicrmw with a cmpxchg, preserve any metadata attached to it. This will avoid unwanted double expansions in a future commit. The initial load (which for some reason is not emitted as an atomic) should also probably receive the same metadata.
b69ead1
to
fe3d55c
Compare
6418d44
to
841f2f9
Compare
When expanding an atomicrmw with a cmpxchg, preserve any metadata
attached to it. This will avoid unwanted double expansions
in a future commit.
The initial load (which for some reason is not emitted as an atomic)
should also probably receive the same metadata.