diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 607825ea50e7c..f1262e11d6fa9 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,8 +321,7 @@ class SICacheControl { bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -603,8 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2538,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2553,9 +2548,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); if (!CPol) return false; @@ -2570,7 +2564,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); return false; @@ -2674,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2690,7 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); - Changed |= CC->finalizeStore(MI, /*Atomic=*/true); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2703,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->finalizeStore(MI, /*Atomic=*/false); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll index 84123e6c1e521..393581fd148f6 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll @@ -141,7 +141,6 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc2508411ed6b..5e5e3bf83d610 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { @@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) {