Skip to content

Commit 6c5ec02

Browse files
committed
[AMDGPU][gfx12] Clean-up implementation of waits before SCOPE_SYS stores
We can do it all in finalizeStore if we ensure it always sees the stores. For that, I needed to fix a hidden bug where finalizeStore wouldn't see all stores because sometimes the iterator got out-of-sync and didn't point to the store anymore. This also removes the waits before volatile LDS stores which never needed it, that was a bug until now.
1 parent fc3c1dd commit 6c5ec02

File tree

2 files changed

+9
-33
lines changed

2 files changed

+9
-33
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -321,8 +321,7 @@ class SICacheControl {
321321
bool IsNonTemporal,
322322
bool IsLastUse = false) const = 0;
323323

324-
virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
325-
bool Atomic) const {
324+
virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
326325
return false;
327326
};
328327

@@ -603,8 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
603602
bool IsVolatile, bool IsNonTemporal,
604603
bool IsLastUse) const override;
605604

606-
bool finalizeStore(MachineBasicBlock::iterator &MI,
607-
bool Atomic) const override;
605+
bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
608606

609607
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
610608
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2528,9 +2526,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25282526
if (IsVolatile) {
25292527
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
25302528

2531-
if (Op == SIMemOp::STORE)
2532-
Changed |= insertWaitsBeforeSystemScopeStore(MI);
2533-
25342529
// Ensure operation has completed at system scope to cause all volatile
25352530
// operations to be visible outside the program in a global order. Do not
25362531
// request cross address space as only the global address space can be
@@ -2543,9 +2538,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25432538
return Changed;
25442539
}
25452540

2546-
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
2547-
bool Atomic) const {
2548-
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2541+
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2542+
MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
25492543
if (!CPol)
25502544
return false;
25512545

@@ -2560,7 +2554,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
25602554

25612555
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
25622556
// space.
2563-
if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
2557+
if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
25642558
return setScope(MI, CPol::SCOPE_SE);
25652559

25662560
return false;
@@ -2664,6 +2658,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26642658
assert(!MI->mayLoad() && MI->mayStore());
26652659

26662660
bool Changed = false;
2661+
// FIXME: Necessary hack because iterator can lose track of the store.
2662+
MachineInstr &StoreMI = *MI;
26672663

26682664
if (MOI.isAtomic()) {
26692665
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
@@ -2680,7 +2676,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26802676
MOI.getIsCrossAddressSpaceOrdering(),
26812677
Position::BEFORE);
26822678

2683-
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
2679+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
26842680
return Changed;
26852681
}
26862682

@@ -2693,7 +2689,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26932689

26942690
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
26952691
// instruction field, do not confuse it with atomic scope.
2696-
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
2692+
Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
26972693
return Changed;
26982694
}
26992695

llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0(
415415
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
416416
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
417417
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
418-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
419-
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
420-
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
421-
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
422-
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
423418
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
424419
; GFX12-WGP-NEXT: s_endpgm
425420
;
@@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0(
432427
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
433428
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
434429
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
435-
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
436-
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
437-
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
438-
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
439-
; GFX12-CU-NEXT: s_wait_storecnt 0x0
440430
; GFX12-CU-NEXT: ds_store_b32 v0, v1
441431
; GFX12-CU-NEXT: s_endpgm
442432
ptr addrspace(1) %in, ptr addrspace(3) %out) {
@@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1(
562552
; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2
563553
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
564554
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
565-
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
566-
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
567-
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
568-
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
569-
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
570555
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
571556
; GFX12-WGP-NEXT: s_endpgm
572557
;
@@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1(
583568
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
584569
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
585570
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
586-
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
587-
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
588-
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
589-
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
590-
; GFX12-CU-NEXT: s_wait_storecnt 0x0
591571
; GFX12-CU-NEXT: ds_store_b32 v0, v1
592572
; GFX12-CU-NEXT: s_endpgm
593573
ptr addrspace(1) %in, ptr addrspace(3) %out) {

0 commit comments

Comments
 (0)