Skip to content

Commit 0f8a62d

Browse files
committed
[AMDGPU][gfx1250] Use SCOPE_SE for stores that may hit scratch
1 parent fdd7f9c commit 0f8a62d

File tree

6 files changed

+194
-79
lines changed

6 files changed

+194
-79
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 1 addition & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ class SIInsertWaitcnts {
552552
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
553553
// FLAT and SCRATCH instructions may access scratch. Other VMEM
554554
// instructions do not.
555-
if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
555+
if (TII->mayAccessScratchThroughFlat(Inst))
556556
return SCRATCH_WRITE_ACCESS;
557557
return VMEM_WRITE_ACCESS;
558558
}
@@ -565,7 +565,6 @@ class SIInsertWaitcnts {
565565

566566
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
567567
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
568-
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
569568
bool isVmemAccess(const MachineInstr &MI) const;
570569
bool generateWaitcntInstBefore(MachineInstr &MI,
571570
WaitcntBrackets &ScoreBrackets,
@@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
21602159
return false;
21612160
}
21622161

2163-
// This is a flat memory operation. Check to see if it has memory tokens for
2164-
// either scratch or FLAT.
2165-
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2166-
const MachineInstr &MI) const {
2167-
assert(TII->isFLAT(MI));
2168-
2169-
// SCRATCH instructions always access scratch.
2170-
if (TII->isFLATScratch(MI))
2171-
return true;
2172-
2173-
// GLOBAL instructions never access scratch.
2174-
if (TII->isFLATGlobal(MI))
2175-
return false;
2176-
2177-
// If there are no memory operands then conservatively assume the flat
2178-
// operation may access scratch.
2179-
if (MI.memoperands_empty())
2180-
return true;
2181-
2182-
// See if any memory operand specifies an address space that involves scratch.
2183-
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
2184-
unsigned AS = Memop->getAddrSpace();
2185-
return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2186-
});
2187-
}
2188-
21892162
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
21902163
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
21912164
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
42494249
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
42504250
}
42514251

4252+
bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
4253+
if (!isFLAT(MI) || isFLATGlobal(MI))
4254+
return false;
4255+
4256+
// If scratch is not initialized, we can never access it.
4257+
if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
4258+
return false;
4259+
4260+
// SCRATCH instructions always access scratch.
4261+
if (isFLATScratch(MI))
4262+
return true;
4263+
4264+
// If there are no memory operands then conservatively assume the flat
4265+
// operation may access scratch.
4266+
if (MI.memoperands_empty())
4267+
return true;
4268+
4269+
// TODO: Should this also consult !noalias.addrspace metadata to rule out scratch?
4270+
4271+
// See if any memory operand specifies an address space that involves scratch.
4272+
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
4273+
unsigned AS = Memop->getAddrSpace();
4274+
return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
4275+
});
4276+
}
4277+
42524278
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
42534279
// Skip the full operand and register alias search modifiesRegister
42544280
// does. There's only a handful of instructions that touch this, it's only an

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
678678
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
679679
}
680680

681+
/// \returns true for SCRATCH_ instructions, and for FLAT_ instructions whose
682+
/// memory operands are absent or may address scratch (private or flat).
683+
/// Conservatively correct: returns true unless \p MI can be proven not to
684+
/// access scratch (e.g. the function has "amdgpu-no-flat-scratch-init").
685+
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
686+
681687
static bool isBlockLoadStore(uint16_t Opcode) {
682688
switch (Opcode) {
683689
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ class SICacheControl {
321321
bool IsNonTemporal,
322322
bool IsLastUse = false) const = 0;
323323

324-
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
324+
virtual bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const {
325325
return false;
326326
};
327327

@@ -602,7 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
602602
bool IsVolatile, bool IsNonTemporal,
603603
bool IsLastUse) const override;
604604

605-
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
605+
bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const override;
606606

607607
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
608608
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@@ -2551,11 +2551,25 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
25512551
return Changed;
25522552
}
25532553

2554-
bool SIGfx12CacheControl::expandSystemScopeStore(
2555-
MachineBasicBlock::iterator &MI) const {
2554+
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
2555+
bool Atomic) const {
25562556
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2557-
if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2558-
return insertWaitsBeforeSystemScopeStore(MI);
2557+
if (!CPol)
2558+
return false;
2559+
2560+
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2561+
2562+
// GFX12.0 only: Extra waits needed before system scope stores.
2563+
if (!ST.hasGFX1250Insts()) {
2564+
if (!Atomic && Scope == CPol::SCOPE_SYS)
2565+
return insertWaitsBeforeSystemScopeStore(MI);
2566+
return false;
2567+
}
2568+
2569+
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
2570+
// space.
2571+
if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
2572+
return setScope(MI, CPol::SCOPE_SE);
25592573

25602574
return false;
25612575
}
@@ -2674,6 +2688,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26742688
MOI.getIsCrossAddressSpaceOrdering(),
26752689
Position::BEFORE);
26762690

2691+
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
26772692
return Changed;
26782693
}
26792694

@@ -2686,7 +2701,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
26862701

26872702
// GFX12 specific: scope (the desired coherence domain in the cache hierarchy)
26882703
// is an instruction field; do not confuse it with atomic scope.
2689-
Changed |= CC->expandSystemScopeStore(MI);
2704+
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
26902705
return Changed;
26912706
}
26922707

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
3+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
4+
5+
; Test that stores that may hit scratch are correctly promoted to SCOPE_SE.
6+
7+
define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
8+
; GCN-LABEL: test_scratch_store:
9+
; GCN: ; %bb.0:
10+
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
11+
; GCN-NEXT: s_wait_kmcnt 0x0
12+
; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE
13+
; GCN-NEXT: s_set_pc_i64 s[30:31]
14+
store i32 %val, ptr addrspace(5) %ptr
15+
ret void
16+
}
17+
18+
define void @test_unknown_flat_store(ptr %ptr, i32 %val) {
19+
; GCN-LABEL: test_unknown_flat_store:
20+
; GCN: ; %bb.0:
21+
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
22+
; GCN-NEXT: s_wait_kmcnt 0x0
23+
; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
24+
; GCN-NEXT: s_wait_dscnt 0x0
25+
; GCN-NEXT: s_set_pc_i64 s[30:31]
26+
store i32 %val, ptr %ptr
27+
ret void
28+
}
29+
30+
define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
31+
; GCN-LABEL: test_flat_store_no_scratch_alloc:
32+
; GCN: ; %bb.0:
33+
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
34+
; GCN-NEXT: s_wait_kmcnt 0x0
35+
; GCN-NEXT: flat_store_b32 v[0:1], v2
36+
; GCN-NEXT: s_wait_dscnt 0x0
37+
; GCN-NEXT: s_set_pc_i64 s[30:31]
38+
store i32 %val, ptr %ptr
39+
ret void
40+
}
41+
42+
; TODO: use !noalias.addrspace to prove this store cannot hit scratch.
43+
define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
44+
; GCN-LABEL: test_flat_store_noalias_addrspace:
45+
; GCN: ; %bb.0:
46+
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
47+
; GCN-NEXT: s_wait_kmcnt 0x0
48+
; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
49+
; GCN-NEXT: s_wait_dscnt 0x0
50+
; GCN-NEXT: s_set_pc_i64 s[30:31]
51+
store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
52+
ret void
53+
}
54+
55+
; TODO: would be nice to handle too; a select of global/LDS pointers cannot be scratch.
56+
define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
57+
; GCN-SDAG-LABEL: test_flat_store_select:
58+
; GCN-SDAG: ; %bb.0:
59+
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
60+
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
61+
; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
62+
; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
63+
; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
64+
; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
65+
; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
66+
; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
67+
; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
68+
; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0
69+
; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
70+
; GCN-SDAG-NEXT: s_wait_dscnt 0x0
71+
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
72+
;
73+
; GCN-GISEL-LABEL: test_flat_store_select:
74+
; GCN-GISEL: ; %bb.0:
75+
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
76+
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
77+
; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
78+
; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
79+
; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
80+
; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
81+
; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
82+
; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
83+
; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
84+
; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1
85+
; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
86+
; GCN-GISEL-NEXT: s_wait_dscnt 0x0
87+
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
88+
%a.ascast = addrspacecast ptr addrspace(1) %a to ptr
89+
%b.ascast = addrspacecast ptr addrspace(3) %b to ptr
90+
%ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast
91+
store i32 %val, ptr %ptr
92+
ret void
93+
}
94+
95+
attributes #0 = { "amdgpu-no-flat-scratch-init" }

llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll

Lines changed: 44 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
124124
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
125125
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
126126
; GCN-SDAG-NEXT: s_clause 0xd
127-
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52
128-
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48
129-
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44
130-
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40
131-
; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36
132-
; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32
133-
; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
134-
; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
135-
; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
136-
; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
137-
; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
138-
; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
139-
; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
140-
; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
127+
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
128+
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
129+
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
130+
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
131+
; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
132+
; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
133+
; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
134+
; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
135+
; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
136+
; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
137+
; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
138+
; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
139+
; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
140+
; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
141141
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
142142
; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
143143
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
144-
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
144+
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
145145
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
146146
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
147-
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
147+
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
148148
; GCN-SDAG-NEXT: s_clause 0xd
149149
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
150150
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
@@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
206206
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
207207
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
208208
; GCN-GISEL-NEXT: s_clause 0xf
209-
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60
210-
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56
211-
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52
212-
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48
213-
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44
214-
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40
215-
; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36
216-
; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32
217-
; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
218-
; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
219-
; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
220-
; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
221-
; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
222-
; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
223-
; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
224-
; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
209+
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
210+
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
211+
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
212+
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
213+
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
214+
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
215+
; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
216+
; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
217+
; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
218+
; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
219+
; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
220+
; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
221+
; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
222+
; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
223+
; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
224+
; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
225225
; GCN-GISEL-NEXT: s_wait_xcnt 0x8
226226
; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
227227
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
228228
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
229-
; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
229+
; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
230230
; GCN-GISEL-NEXT: s_clause 0xe
231231
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
232232
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
@@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
244244
; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
245245
; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
246246
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
247-
; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
247+
; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
248248
; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
249249
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
250250
; GCN-GISEL-NEXT: s_clause 0xe
@@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
299299
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
300300
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
301301
; GCN-SDAG-NEXT: s_clause 0x3
302-
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12
303-
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8
304-
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
305-
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
302+
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
303+
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
304+
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
305+
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
306306
; GCN-SDAG-NEXT: s_clause 0x7
307307
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
308308
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
@@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
385385
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
386386
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
387387
; GCN-GISEL-NEXT: s_clause 0x5
388-
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20
389-
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16
390-
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12
391-
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8
392-
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4
393-
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32
388+
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
389+
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
390+
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
391+
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
392+
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
393+
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
394394
; GCN-GISEL-NEXT: s_clause 0x7
395395
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
396396
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off

0 commit comments

Comments
 (0)