[AMDGPU][gfx12] Clean-up implementation of waits before SCOPE_SYS stores #150587
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

We can do it all in finalizeStore if we ensure it always sees the stores. This also removes the waits before volatile LDS stores, which never needed them; emitting those waits was a bug until now.

Full diff: https://github.com/llvm/llvm-project/pull/150587.diff

2 Files Affected:
- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
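The iterator issue is easiest to see in a toy model. Below is a minimal, self-contained analogue in which std::list stands in for LLVM's MachineBasicBlock instruction list; every name here is illustrative, not actual LLVM API. A helper that repositions the caller's iterator while inserting a wait leaves the iterator pointing at the wait, whereas a reference to the instruction itself stays valid, which is the pattern the patch adopts.

#include <cassert>
#include <iterator>
#include <list>
#include <string>

using Block = std::list<std::string>;

// Like some legalizer helpers, this repositions the caller's iterator as a
// side effect of inserting code around the instruction.
static void insertWaitAfter(Block &B, Block::iterator &It) {
  ++It;                                      // step past the store
  It = B.insert(It, "s_wait_storecnt 0x0");  // It now names the wait
}

int main() {
  Block B = {"v_mov_b32", "ds_store_b32", "s_endpgm"};
  Block::iterator MI = std::next(B.begin()); // points at the store

  // Pin the instruction itself; std::list (like LLVM's intrusive list)
  // never relocates elements, so this reference stays valid.
  std::string &StoreMI = *MI;

  insertWaitAfter(B, MI);

  assert(*MI == "s_wait_storecnt 0x0"); // the iterator drifted off the store
  assert(StoreMI == "ds_store_b32");    // the pinned reference did not
}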
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 4456d9b241425..d6337a85a7361 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -321,7 +321,7 @@ class SICacheControl {
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
- virtual bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const {
+ virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};
@@ -602,7 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
- bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const override;
+ bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2526,9 +2526,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
- if (Op == SIMemOp::STORE)
- Changed |= insertWaitsBeforeSystemScopeStore(MI);
-
// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
@@ -2541,9 +2538,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
-bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
- bool Atomic) const {
- MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
if (!CPol)
return false;
@@ -2558,7 +2554,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
// space.
- if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
+ if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
return setScope(MI, CPol::SCOPE_SE);
return false;
@@ -2662,6 +2658,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ // FIXME: Necessary hack because iterator can lose track of the store.
+ MachineInstr &StoreMI = *MI;
if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2678,7 +2676,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
- Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}
@@ -2691,7 +2689,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
- Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
+ Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index bc2508411ed6b..5e5e3bf83d610 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
-; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
-; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
-; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_loadcnt 0x0
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0
-; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
-; GFX12-CU-NEXT: s_wait_kmcnt 0x0
-; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
We can do it all in finalizeStore if we ensure it always sees the stores. For that, I needed to fix a hidden bug where finalizeStore wouldn't see all stores: sometimes the iterator got out of sync and no longer pointed to the store. This also removes the waits before volatile LDS stores, which never needed them; emitting those waits was a bug until now.
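In code, the fix has two parts, both visible in the diff above: finalizeStore now takes the instruction rather than an iterator, and expandStore pins the store before any wait insertion can move the iterator. A simplified sketch of that shape follows, using toy Instr/Block types; the real signatures use MachineInstr and MachineBasicBlock::iterator.

#include <list>
#include <string>

struct Instr { std::string Opcode; };
using Block = std::list<Instr>;

// New-style hook: operates on the instruction itself, so it is immune to any
// earlier helper having reseated the caller's iterator.
static bool finalizeStore(Instr &MI, bool Atomic) {
  (void)MI; (void)Atomic; // e.g. set or promote the cpol scope here
  return false;
}

// Caller pattern from expandStore: capture the store up front, then hand the
// pinned instruction to finalizeStore regardless of where MI ends up.
static bool expandStore(Block &B, Block::iterator MI) {
  (void)B;              // wait/fence insertion into B elided in this sketch
  Instr &StoreMI = *MI; // survives any repositioning of MI
  return finalizeStore(StoreMI, /*Atomic=*/false);
}

int main() {
  Block B = {{"ds_store_b32"}};
  expandStore(B, B.begin());
}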