Skip to content

Commit 1ea2ac8

Browse files
committed
[AMDGPU][gfx1250] Add cu-store option
Determines whether we can use `SCOPE_CU` stores (on by default), or whether all stores must be done at `SCOPE_SE` minimum.
1 parent a6532c2 commit 1ea2ac8

File tree

11 files changed

+145
-8
lines changed

11 files changed

+145
-8
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,9 @@ For example:
768768
performant than code generated for XNACK replay
769769
disabled.
770770

771+
cu-stores TODO On GFX12.5, controls whether ``scope:SCOPE_CU`` stores may be used.
772+
If disabled, all stores will be done at ``scope:SCOPE_SE`` or greater.
773+
771774
=============== ============================ ==================================================
772775

773776
.. _amdgpu-target-id:
@@ -5107,7 +5110,9 @@ The fields used by CP for code objects before V3 also match those specified in
51075110
and must be 0,
51085111
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
51095112
_SIZE
5110-
457:455 3 bits Reserved, must be 0.
5113+
456:455 2 bits Reserved, must be 0.
5114+
457 1 bit USES_CU_STORES GFX12.5: Whether the ``cu-stores`` target attribute is enabled.
5115+
If 0, then all stores are ``SCOPE_SE`` or higher.
51115116
458 1 bit ENABLE_WAVEFRONT_SIZE32 GFX6-GFX9
51125117
Reserved, must be 0.
51135118
GFX10-GFX11
@@ -18188,6 +18193,8 @@ terminated by an ``.end_amdhsa_kernel`` directive.
1818818193
GFX942)
1818918194
``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in
1819018195
:ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
18196+
``.amdhsa_uses_cu_stores`` 0 GFX12.5 Controls USES_CU_STORES in
18197+
:ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
1819118198
``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in
1819218199
Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`.
1819318200
Specific

llvm/include/llvm/Support/AMDHSAKernelDescriptor.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,8 @@ enum : int32_t {
223223
KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1),
224224
KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
225225
KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
226-
KERNEL_CODE_PROPERTY(RESERVED0, 7, 3),
226+
KERNEL_CODE_PROPERTY(RESERVED0, 7, 2),
227+
KERNEL_CODE_PROPERTY(USES_CU_STORES, 9, 1), // GFX12.5 +cu-stores
227228
KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+
228229
KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1),
229230
KERNEL_CODE_PROPERTY(RESERVED1, 12, 4),

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,11 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch",
284284
"HasSafeCUPrefetch",
285285
"true",
286286
"VMEM CU scope prefetches do not fail on illegal address"
287+
288+
def FeatureCUStores : SubtargetFeature<"cu-stores",
289+
"HasCUStores",
290+
"true",
291+
"Whether SCOPE_CU stores can be used on GFX12.5"
287292
>;
288293

289294
def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard",
@@ -1988,6 +1993,7 @@ def FeatureISAVersion12 : FeatureSet<
19881993
def FeatureISAVersion12_50 : FeatureSet<
19891994
[FeatureGFX12,
19901995
FeatureGFX1250Insts,
1996+
FeatureCUStores,
19911997
FeatureCuMode,
19921998
Feature64BitLiterals,
19931999
FeatureLDSBankCount32,

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
552552
MCContext &Ctx = MF.getContext();
553553
uint16_t KernelCodeProperties = 0;
554554
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
555+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
555556

556557
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
557558
KernelCodeProperties |=
@@ -581,10 +582,13 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
581582
KernelCodeProperties |=
582583
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
583584
}
584-
if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
585+
if (ST.isWave32()) {
585586
KernelCodeProperties |=
586587
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
587588
}
589+
if (isGFX1250(ST) && ST.hasCUStores()) {
590+
KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES;
591+
}
588592

589593
// CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
590594
// un-evaluatable at this point so it cannot be conditionally checked here.

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6066,6 +6066,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
60666066
ExprVal, ValRange);
60676067
if (Val)
60686068
ImpliedUserSGPRCount += 1;
6069+
} else if (ID == ".amdhsa_uses_cu_stores") {
6070+
if (!isGFX1250())
6071+
return Error(IDRange.Start, "directive requires gfx12.5", IDRange);
6072+
6073+
PARSE_BITS_ENTRY(KD.kernel_code_properties,
6074+
KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange);
60696075
} else if (ID == ".amdhsa_wavefront_size32") {
60706076
EXPR_RESOLVE_OR_ERROR(EvaluatableExpr);
60716077
if (IVersion.Major < 10)

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,6 +2556,9 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective(
25562556
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
25572557
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
25582558
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
2559+
if (isGFX1250())
2560+
PRINT_DIRECTIVE(".amdhsa_uses_cu_stores",
2561+
KERNEL_CODE_PROPERTY_USES_CU_STORES);
25592562

25602563
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
25612564
return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0,

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
248248
bool HasVmemPrefInsts = false;
249249
bool HasSafeSmemPrefetch = false;
250250
bool HasSafeCUPrefetch = false;
251+
bool HasCUStores = false;
251252
bool HasVcmpxExecWARHazard = false;
252253
bool HasLdsBranchVmemWARHazard = false;
253254
bool HasNSAtoVMEMBug = false;
@@ -998,6 +999,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
998999

9991000
bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
10001001

1002+
bool hasCUStores() const { return HasCUStores; }
1003+
10011004
// Has s_cmpk_* instructions.
10021005
bool hasSCmpK() const { return getGeneration() < GFX12; }
10031006

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
440440
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
441441
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
442442
".amdhsa_user_sgpr_private_segment_size");
443+
if (isGFX1250(STI))
444+
PrintField(KD.kernel_code_properties,
445+
amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT,
446+
amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES,
447+
".amdhsa_uses_cu_stores");
443448
if (IVersion.Major >= 10)
444449
PrintField(KD.kernel_code_properties,
445450
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2564,7 +2564,9 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
25642564

25652565
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
25662566
// space.
2567-
if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
2567+
// We also require SCOPE_SE minimum if we do not have the "cu-stores" feature.
2568+
if (Scope == CPol::SCOPE_CU &&
2569+
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
25682570
return setScope(MI, CPol::SCOPE_SE);
25692571

25702572
return false;
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s
3+
4+
; Check that if the cu-stores feature is disabled (-mattr=-cu-stores), we use SCOPE_SE minimum on all stores.
5+
6+
; GCN: flat_store:
7+
; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
8+
; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
9+
; GCN: .amdhsa_kernel flat_store
10+
; CU: .amdhsa_uses_cu_stores 1
11+
; NOCU: .amdhsa_uses_cu_stores 0
12+
define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) {
13+
entry:
14+
store i32 %val, ptr %dst
15+
ret void
16+
}
17+
18+
; GCN: global_store:
19+
; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
20+
; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
21+
; GCN: .amdhsa_kernel global_store
22+
; CU: .amdhsa_uses_cu_stores 1
23+
; NOCU: .amdhsa_uses_cu_stores 0
24+
define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) {
25+
entry:
26+
store i32 %val, ptr addrspace(1) %dst
27+
ret void
28+
}
29+
30+
; GCN: local_store:
31+
; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
32+
; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
33+
; GCN: .amdhsa_kernel local_store
34+
; CU: .amdhsa_uses_cu_stores 1
35+
; NOCU: .amdhsa_uses_cu_stores 0
36+
define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) {
37+
entry:
38+
store i32 %val, ptr addrspace(3) %dst
39+
ret void
40+
}
41+
42+
; GCN: scratch_store:
43+
; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
44+
; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
45+
; GCN: .amdhsa_kernel scratch_store
46+
; CU: .amdhsa_uses_cu_stores 1
47+
; NOCU: .amdhsa_uses_cu_stores 0
48+
define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) {
49+
entry:
50+
store i32 %val, ptr addrspace(5) %dst
51+
ret void
52+
}
53+
54+
; GCN: flat_atomic_store:
55+
; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
56+
; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
57+
; GCN: .amdhsa_kernel flat_atomic_store
58+
; CU: .amdhsa_uses_cu_stores 1
59+
; NOCU: .amdhsa_uses_cu_stores 0
60+
define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) {
61+
entry:
62+
store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4
63+
ret void
64+
}
65+
66+
; GCN: global_atomic_store:
67+
; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}}
68+
; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE
69+
; GCN: .amdhsa_kernel global_atomic_store
70+
; CU: .amdhsa_uses_cu_stores 1
71+
; NOCU: .amdhsa_uses_cu_stores 0
72+
define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) {
73+
entry:
74+
store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4
75+
ret void
76+
}
77+
78+
; GCN: local_atomic_store:
79+
; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
80+
; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}}
81+
; GCN: .amdhsa_kernel local_atomic_store
82+
; CU: .amdhsa_uses_cu_stores 1
83+
; NOCU: .amdhsa_uses_cu_stores 0
84+
define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) {
85+
entry:
86+
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
87+
ret void
88+
}
89+
90+
; GCN: scratch_atomic_store:
91+
; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
92+
; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
93+
; GCN: .amdhsa_kernel scratch_atomic_store
94+
; CU: .amdhsa_uses_cu_stores 1
95+
; NOCU: .amdhsa_uses_cu_stores 0
96+
define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
97+
entry:
98+
store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
99+
ret void
100+
}

0 commit comments

Comments
 (0)