Skip to content

Commit eb43b79

Browse files
changpeng and jayfoad authored
[AMDGPU] Disable SGPR read hazard mitigation for gfx1250 (#150344)
Co-authored-by: Jay Foad <Jay.Foad@amd.com>
1 parent 31db0f0 commit eb43b79

File tree

10 files changed

+93
-602
lines changed

10 files changed

+93
-602
lines changed

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1308,7 +1308,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13081308

13091309
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
13101310

1311-
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
1311+
bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
13121312

13131313
/// Return if operations acting on VGPR tuples require even alignment.
13141314
bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; }

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 14 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1217,11 +1217,9 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
12171217
; GFX1250-NEXT: v_mov_b32_e32 v13, v10
12181218
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
12191219
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
1220-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1220+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12211221
; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
1222-
; GFX1250-NEXT: s_wait_alu 0xf1ff
12231222
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
1224-
; GFX1250-NEXT: s_wait_alu 0xfffd
12251223
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
12261224
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v8, v1, vcc_lo
12271225
; GFX1250-NEXT: v_mov_b32_e32 v1, v6
@@ -2865,19 +2863,17 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
28652863
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28662864
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v1, v13, v[16:17]
28672865
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
2868-
; GFX1250-NEXT: s_wait_alu 0xf1ff
2866+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28692867
; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
2870-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
28712868
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v2, v12, v[16:17]
2869+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
28722870
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2873-
; GFX1250-NEXT: s_wait_alu 0xfffd
2874-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
28752871
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
28762872
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], null, v0, v10, 0
2873+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
28772874
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v3, v11, v[16:17]
2878-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
28792875
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2880-
; GFX1250-NEXT: s_wait_alu 0xfffd
2876+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
28812877
; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
28822878
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v4, v10, v[16:17]
28832879
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -2887,65 +2883,56 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
28872883
; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[16:17]
28882884
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
28892885
; GFX1250-NEXT: v_mov_b32_e32 v20, v19
2890-
; GFX1250-NEXT: s_wait_alu 0xfffd
28912886
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v24, vcc_lo
2892-
; GFX1250-NEXT: s_wait_alu 0xf1ff
28932887
; GFX1250-NEXT: v_cndmask_b32_e64 v19, 0, 1, s0
28942888
; GFX1250-NEXT: v_mov_b32_e32 v21, v22
28952889
; GFX1250-NEXT: v_mul_lo_u32 v22, v5, v10
28962890
; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v2, v8, v[16:17]
2897-
; GFX1250-NEXT: s_wait_alu 0xfffd
2891+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
28982892
; GFX1250-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v19, vcc_lo
28992893
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v0, v13, v[20:21]
29002894
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
29012895
; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v25
29022896
; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
29032897
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v1, v12, v[16:17]
2904-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2898+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
29052899
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
2906-
; GFX1250-NEXT: s_wait_alu 0xf1ff
29072900
; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
2901+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
29082902
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
2909-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
29102903
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
29112904
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], null, v0, v8, 0
2905+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
29122906
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
29132907
; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
2914-
; GFX1250-NEXT: s_wait_alu 0xf1ff
29152908
; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
29162909
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
29172910
; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
2918-
; GFX1250-NEXT: s_wait_alu 0xf1ff
2919-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
29202911
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
29212912
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
2913+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
29222914
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
29232915
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
29242916
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
2925-
; GFX1250-NEXT: s_wait_alu 0xf1ff
29262917
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
29272918
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
29282919
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
29292920
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
29302921
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
2931-
; GFX1250-NEXT: s_wait_alu 0xf1ff
2922+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29322923
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
2933-
; GFX1250-NEXT: s_wait_alu 0xf1ff
29342924
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v27, v13, s2
2935-
; GFX1250-NEXT: s_wait_alu 0xf1ff
2936-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2925+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29372926
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
2938-
; GFX1250-NEXT: s_wait_alu 0xf1ff
29392927
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v6, v11, s2
2940-
; GFX1250-NEXT: s_wait_alu 0xf1ff
2928+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29412929
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
29422930
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
29432931
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29442932
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
29452933
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
2946-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2934+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29472935
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
2948-
; GFX1250-NEXT: s_wait_alu 0xfffd
29492936
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
29502937
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29512938
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v26, s0

llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,6 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
218218
; GFX1250-NEXT: v_cndmask_b32_e64 v2, -1, 1, s1
219219
; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
220220
; GFX1250-NEXT: v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
221-
; GFX1250-NEXT: s_wait_alu 0xf1ff
222221
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
223222
; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
224223
; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9
@@ -229,7 +228,6 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
229228
; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
230229
; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
231230
; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
232-
; GFX1250-NEXT: s_wait_alu 0xfffe
233231
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
234232
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
235233
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1

llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -168,7 +168,6 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
168168
; GCN-NEXT: s_sleep 0
169169
; GCN-NEXT: s_sleep 0
170170
; GCN-NEXT: .LBB3_2: ; %bb3
171-
; GCN-NEXT: s_wait_alu 0xfffe
172171
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0
173172
; GCN-NEXT: global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
174173
; GCN-NEXT: s_wait_storecnt 0x0
@@ -589,7 +588,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32
589588
; GCN-NEXT: v_mov_b32_e32 v1, 0
590589
; GCN-NEXT: s_wait_kmcnt 0x0
591590
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
592-
; GCN-NEXT: s_wait_alu 0xfffe
591+
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
593592
; GCN-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
594593
; GCN-NEXT: global_store_b32 v1, v0, s[0:1]
595594
; GCN-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/carryout-selection.ll

Lines changed: 18 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -822,10 +822,9 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
822822
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
823823
; GFX1250-NEXT: s_wait_kmcnt 0x0
824824
; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
825-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
825+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
826826
; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
827827
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
828-
; GFX1250-NEXT: s_wait_alu 0xf1ff
829828
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
830829
; GFX1250-NEXT: s_clause 0x1
831830
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1803,10 +1802,9 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
18031802
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
18041803
; GFX1250-NEXT: s_wait_kmcnt 0x0
18051804
; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
1806-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
1805+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
18071806
; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
18081807
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
1809-
; GFX1250-NEXT: s_wait_alu 0xf1ff
18101808
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
18111809
; GFX1250-NEXT: s_clause 0x1
18121810
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3136,126 +3134,105 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
31363134
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31373135
; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000
31383136
; GFX1250-NEXT: v_s_rcp_f32 s0, s0
3139-
; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
3137+
; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31403138
; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc
3141-
; GFX1250-NEXT: s_wait_alu 0xfffe
31423139
; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000
3143-
; GFX1250-NEXT: s_wait_alu 0xfffe
3144-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
3140+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31453141
; GFX1250-NEXT: s_trunc_f32 s1, s1
3146-
; GFX1250-NEXT: s_wait_alu 0xfffe
31473142
; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000
31483143
; GFX1250-NEXT: s_cvt_u32_f32 s5, s1
31493144
; GFX1250-NEXT: s_mov_b32 s1, 0
3150-
; GFX1250-NEXT: s_wait_alu 0xfffe
3145+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
31513146
; GFX1250-NEXT: s_cvt_u32_f32 s4, s0
3152-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
31533147
; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5]
3148+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31543149
; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13
31553150
; GFX1250-NEXT: s_mul_i32 s14, s4, s13
31563151
; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12
31573152
; GFX1250-NEXT: s_mul_i32 s17, s5, s12
3158-
; GFX1250-NEXT: s_wait_alu 0xfffe
31593153
; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15]
31603154
; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12
31613155
; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13
31623156
; GFX1250-NEXT: s_add_co_u32 s0, s14, s17
31633157
; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16
31643158
; GFX1250-NEXT: s_mul_i32 s12, s5, s13
31653159
; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0
3166-
; GFX1250-NEXT: s_wait_alu 0xfffe
3160+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
31673161
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
3168-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31693162
; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12
31703163
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
31713164
; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13
31723165
; GFX1250-NEXT: v_readfirstlane_b32 s4, v0
3173-
; GFX1250-NEXT: s_wait_alu 0xfffe
31743166
; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5]
31753167
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31763168
; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7
31773169
; GFX1250-NEXT: s_mul_i32 s12, s4, s7
31783170
; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6
31793171
; GFX1250-NEXT: s_mul_i32 s15, s5, s6
3180-
; GFX1250-NEXT: s_wait_alu 0xfffe
31813172
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
31823173
; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6
31833174
; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7
3184-
; GFX1250-NEXT: s_wait_alu 0xfffe
31853175
; GFX1250-NEXT: s_add_co_u32 s0, s12, s15
31863176
; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14
31873177
; GFX1250-NEXT: s_mul_i32 s6, s5, s7
31883178
; GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0
3189-
; GFX1250-NEXT: s_wait_alu 0xfffe
3179+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
31903180
; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7]
3191-
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
31923181
; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6
31933182
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
31943183
; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7
31953184
; GFX1250-NEXT: v_readfirstlane_b32 s7, v0
3196-
; GFX1250-NEXT: s_wait_alu 0xfffe
31973185
; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0
31983186
; GFX1250-NEXT: s_mul_i32 s4, s10, s0
31993187
; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0
32003188
; GFX1250-NEXT: s_mul_i32 s6, s11, s0
32013189
; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7
32023190
; GFX1250-NEXT: s_mul_i32 s13, s11, s7
3203-
; GFX1250-NEXT: s_wait_alu 0xfffe
32043191
; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5]
32053192
; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7
3206-
; GFX1250-NEXT: s_wait_alu 0xfffe
32073193
; GFX1250-NEXT: s_add_co_u32 s4, s4, s13
32083194
; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0
32093195
; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0
3210-
; GFX1250-NEXT: s_wait_alu 0xfffe
3196+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32113197
; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7]
3212-
; GFX1250-NEXT: s_wait_alu 0xfffe
32133198
; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
3214-
; GFX1250-NEXT: s_wait_alu 0xfffe
3199+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32153200
; GFX1250-NEXT: s_or_b32 s6, s6, s4
3216-
; GFX1250-NEXT: s_wait_alu 0xfffe
32173201
; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7]
32183202
; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2
3219-
; GFX1250-NEXT: s_wait_alu 0xfffe
32203203
; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4
32213204
; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5
32223205
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
32233206
; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
32243207
; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2
3225-
; GFX1250-NEXT: s_wait_alu 0xfffe
32263208
; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3
32273209
; GFX1250-NEXT: s_cmp_lg_u32 s12, 0
32283210
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1
32293211
; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
3230-
; GFX1250-NEXT: s_wait_alu 0xfffe
32313212
; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0
3232-
; GFX1250-NEXT: s_wait_alu 0xfffe
3213+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32333214
; GFX1250-NEXT: s_cmp_ge_u32 s4, s3
32343215
; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
32353216
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
32363217
; GFX1250-NEXT: s_cmp_eq_u32 s4, s3
32373218
; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0
32383219
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
3239-
; GFX1250-NEXT: s_wait_alu 0xfffe
32403220
; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo
32413221
; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
32423222
; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5
3243-
; GFX1250-NEXT: s_wait_alu 0xfffe
3223+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32443224
; GFX1250-NEXT: s_cmp_ge_u32 s0, s3
3245-
; GFX1250-NEXT: s_wait_alu 0xfffd
32463225
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
32473226
; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
32483227
; GFX1250-NEXT: s_cmp_eq_u32 s0, s3
32493228
; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
32503229
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
3251-
; GFX1250-NEXT: s_wait_alu 0xfffe
3230+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
32523231
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0
3253-
; GFX1250-NEXT: s_wait_alu 0xfffd
32543232
; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo
32553233
; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo
3256-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
32573234
; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3258-
; GFX1250-NEXT: s_wait_alu 0xfffd
3235+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
32593236
; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo
32603237
; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo
32613238
; GFX1250-NEXT: s_cbranch_execnz .LBB16_3
@@ -3269,31 +3246,25 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
32693246
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
32703247
; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0
32713248
; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
3272-
; GFX1250-NEXT: s_wait_alu 0xfffe
32733249
; GFX1250-NEXT: s_mul_i32 s1, s1, s0
3274-
; GFX1250-NEXT: s_wait_alu 0xfffe
3250+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32753251
; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1
3276-
; GFX1250-NEXT: s_wait_alu 0xfffe
32773252
; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
3278-
; GFX1250-NEXT: s_wait_alu 0xfffe
3253+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
32793254
; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0
3280-
; GFX1250-NEXT: s_wait_alu 0xfffe
32813255
; GFX1250-NEXT: s_mul_i32 s1, s0, s2
32823256
; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
3283-
; GFX1250-NEXT: s_wait_alu 0xfffe
32843257
; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1
3285-
; GFX1250-NEXT: s_wait_alu 0xfffe
3258+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32863259
; GFX1250-NEXT: s_sub_co_i32 s4, s1, s2
32873260
; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
32883261
; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
3289-
; GFX1250-NEXT: s_wait_alu 0xfffe
32903262
; GFX1250-NEXT: s_cselect_b32 s1, s4, s1
32913263
; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
3292-
; GFX1250-NEXT: s_wait_alu 0xfffe
32933264
; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
32943265
; GFX1250-NEXT: s_mov_b32 s1, 0
32953266
; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
3296-
; GFX1250-NEXT: s_wait_alu 0xfffe
3267+
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
32973268
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
32983269
; GFX1250-NEXT: .LBB16_3:
32993270
; GFX1250-NEXT: v_mov_b32_e32 v2, 0

0 commit comments

Comments
 (0)