Skip to content

[AMDGPU][gfx12] Clean-up implementation of waits before SCOPE_SYS stores #150587

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 9 additions & 13 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,7 @@ class SICacheControl {
bool IsNonTemporal,
bool IsLastUse = false) const = 0;

virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
bool Atomic) const {
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
return false;
};

Expand Down Expand Up @@ -603,8 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;

bool finalizeStore(MachineBasicBlock::iterator &MI,
bool Atomic) const override;
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;

bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
Expand Down Expand Up @@ -2538,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
if (IsVolatile) {
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);

if (Op == SIMemOp::STORE)
Changed |= insertWaitsBeforeSystemScopeStore(MI);

// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be
Expand All @@ -2553,9 +2548,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}

bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
bool Atomic) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
if (!CPol)
return false;

Expand All @@ -2570,7 +2564,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,

// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
// space.
if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
return setScope(MI, CPol::SCOPE_SE);

return false;
Expand Down Expand Up @@ -2674,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
assert(!MI->mayLoad() && MI->mayStore());

bool Changed = false;
// FIXME: Necessary hack because iterator can lose track of the store.
MachineInstr &StoreMI = *MI;

if (MOI.isAtomic()) {
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
Expand All @@ -2690,7 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);

Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
return Changed;
}

Expand All @@ -2703,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,

// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
return Changed;
}

Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,6 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
Expand Down
20 changes: 0 additions & 20 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
Expand All @@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
Expand Down Expand Up @@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
Expand All @@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(3) %out) {
Expand Down
Loading