Skip to content

Commit 1ab0333

Browse files
committed
[AMDGPU][gfx12] Clean-up implementation of waits before SCOPE_SYS stores
We can do it all in finalizeStore if we ensure it always sees the stores. For that, I needed to fix a hidden bug where finalizeStore wouldn't see all stores because sometimes the iterator got out-of-sync and didn't point to the store anymore. This also removes the waits before volatile LDS stores which never needed it, that was a bug until now.
1 parent 5ad7ef6 commit 1ab0333

File tree

2 files changed

+9
-33
lines changed

2 files changed

+9
-33
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -321,8 +321,7 @@ class SICacheControl {
321321
bool IsNonTemporal,
322322
bool IsLastUse = false) const = 0;
323323

324-
virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
325-
bool Atomic) const {
324+
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
326325
return false;
327326
};
328327

@@ -603,8 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
603602
bool IsVolatile, bool IsNonTemporal,
604603
bool IsLastUse) const override;
605604

606-
bool finalizeStore(MachineBasicBlock::iterator &MI,
607-
bool Atomic) const override;
605+
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
608606

609607
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
610608
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2538,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25382536
if (IsVolatile) {
25392537
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
25402538

2541-
if (Op == SIMemOp::STORE)
2542-
Changed |= insertWaitsBeforeSystemScopeStore(MI);
2543-
25442539
// Ensure operation has completed at system scope to cause all volatile
25452540
// operations to be visible outside the program in a global order. Do not
25462541
// request cross address space as only the global address space can be
@@ -2553,9 +2548,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25532548
return Changed;
25542549
}
25552550

2556-
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
2557-
bool Atomic) const {
2558-
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2551+
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2552+
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
25592553
if (!CPol)
25602554
return false;
25612555

@@ -2570,7 +2564,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
25702564

25712565
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
25722566
// space.
2573-
if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
2567+
if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
25742568
return setScope(MI, CPol::SCOPE_SE);
25752569

25762570
return false;
@@ -2674,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26742668
assert(!MI->mayLoad() && MI->mayStore());
26752669

26762670
bool Changed = false;
2671+
// FIXME: Necessary hack because iterator can lose track of the store.
2672+
MachineInstr &StoreMI = *MI;
26772673

26782674
if (MOI.isAtomic()) {
26792675
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2690,7 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26902686
MOI.getIsCrossAddressSpaceOrdering(),
26912687
Position::BEFORE);
26922688

2693-
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
2689+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
26942690
return Changed;
26952691
}
26962692

@@ -2703,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
27032699

27042700
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
27052701
// instruction field, do not confuse it with atomic scope.
2706-
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
2702+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
27072703
return Changed;
27082704
}
27092705

llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
415415
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
416416
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
417417
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
418-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
419-
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
420-
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
421-
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
422-
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
423418
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
424419
; GFX12-WGP-NEXT: s_endpgm
425420
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
432427
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
433428
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
434429
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
435-
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
436-
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
437-
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
438-
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
439-
; GFX12-CU-NEXT: s_wait_storecnt 0x0
440430
; GFX12-CU-NEXT: ds_store_b32 v0, v1
441431
; GFX12-CU-NEXT: s_endpgm
442432
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
562552
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
563553
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
564554
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
565-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
566-
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
567-
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
568-
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
569-
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
570555
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
571556
; GFX12-WGP-NEXT: s_endpgm
572557
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
583568
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
584569
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
585570
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
586-
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
587-
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
588-
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
589-
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
590-
; GFX12-CU-NEXT: s_wait_storecnt 0x0
591571
; GFX12-CU-NEXT: ds_store_b32 v0, v1
592572
; GFX12-CU-NEXT: s_endpgm
593573
ptr addrspace(1) %in, ptr addrspace(3) %out) {

0 commit comments

Comments
 (0)