@@ -822,10 +822,9 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
822
822
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
823
823
; GFX1250-NEXT: s_wait_kmcnt 0x0
824
824
; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
825
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2 ) | instid1(VALU_DEP_2)
825
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1 ) | instid1(VALU_DEP_2)
826
826
; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
827
827
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
828
- ; GFX1250-NEXT: s_wait_alu 0xf1ff
829
828
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
830
829
; GFX1250-NEXT: s_clause 0x1
831
830
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -1803,10 +1802,9 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
1803
1802
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
1804
1803
; GFX1250-NEXT: s_wait_kmcnt 0x0
1805
1804
; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7]
1806
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2 ) | instid1(VALU_DEP_2)
1805
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1 ) | instid1(VALU_DEP_2)
1807
1806
; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
1808
1807
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
1809
- ; GFX1250-NEXT: s_wait_alu 0xf1ff
1810
1808
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
1811
1809
; GFX1250-NEXT: s_clause 0x1
1812
1810
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3136,126 +3134,105 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
3136
3134
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3)
3137
3135
; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000
3138
3136
; GFX1250-NEXT: v_s_rcp_f32 s0, s0
3139
- ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1 ) | instid1(SALU_CYCLE_2 )
3137
+ ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT ) | instid1(SALU_CYCLE_3 )
3140
3138
; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc
3141
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3142
3139
; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000
3143
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3144
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
3140
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
3145
3141
; GFX1250-NEXT: s_trunc_f32 s1, s1
3146
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3147
3142
; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000
3148
3143
; GFX1250-NEXT: s_cvt_u32_f32 s5, s1
3149
3144
; GFX1250-NEXT: s_mov_b32 s1, 0
3150
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3145
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
3151
3146
; GFX1250-NEXT: s_cvt_u32_f32 s4, s0
3152
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3153
3147
; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5]
3148
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3154
3149
; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13
3155
3150
; GFX1250-NEXT: s_mul_i32 s14, s4, s13
3156
3151
; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12
3157
3152
; GFX1250-NEXT: s_mul_i32 s17, s5, s12
3158
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3159
3153
; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15]
3160
3154
; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12
3161
3155
; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13
3162
3156
; GFX1250-NEXT: s_add_co_u32 s0, s14, s17
3163
3157
; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16
3164
3158
; GFX1250-NEXT: s_mul_i32 s12, s5, s13
3165
3159
; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0
3166
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3160
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3167
3161
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
3168
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3169
3162
; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12
3170
3163
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
3171
3164
; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13
3172
3165
; GFX1250-NEXT: v_readfirstlane_b32 s4, v0
3173
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3174
3166
; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5]
3175
3167
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3176
3168
; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7
3177
3169
; GFX1250-NEXT: s_mul_i32 s12, s4, s7
3178
3170
; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6
3179
3171
; GFX1250-NEXT: s_mul_i32 s15, s5, s6
3180
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3181
3172
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13]
3182
3173
; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6
3183
3174
; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7
3184
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3185
3175
; GFX1250-NEXT: s_add_co_u32 s0, s12, s15
3186
3176
; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14
3187
3177
; GFX1250-NEXT: s_mul_i32 s6, s5, s7
3188
3178
; GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0
3189
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3179
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3190
3180
; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7]
3191
- ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3192
3181
; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6
3193
3182
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
3194
3183
; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7
3195
3184
; GFX1250-NEXT: v_readfirstlane_b32 s7, v0
3196
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3197
3185
; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0
3198
3186
; GFX1250-NEXT: s_mul_i32 s4, s10, s0
3199
3187
; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0
3200
3188
; GFX1250-NEXT: s_mul_i32 s6, s11, s0
3201
3189
; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7
3202
3190
; GFX1250-NEXT: s_mul_i32 s13, s11, s7
3203
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3204
3191
; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5]
3205
3192
; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7
3206
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3207
3193
; GFX1250-NEXT: s_add_co_u32 s4, s4, s13
3208
3194
; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0
3209
3195
; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0
3210
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3196
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3211
3197
; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7]
3212
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3213
3198
; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000)
3214
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3199
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3215
3200
; GFX1250-NEXT: s_or_b32 s6, s6, s4
3216
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3217
3201
; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7]
3218
3202
; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2
3219
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3220
3203
; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4
3221
3204
; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5
3222
3205
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
3223
3206
; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
3224
3207
; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2
3225
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3226
3208
; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3
3227
3209
; GFX1250-NEXT: s_cmp_lg_u32 s12, 0
3228
3210
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1
3229
3211
; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1
3230
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3231
3212
; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0
3232
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3213
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3233
3214
; GFX1250-NEXT: s_cmp_ge_u32 s4, s3
3234
3215
; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
3235
3216
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
3236
3217
; GFX1250-NEXT: s_cmp_eq_u32 s4, s3
3237
3218
; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0
3238
3219
; GFX1250-NEXT: s_cmp_lg_u32 s0, 0
3239
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3240
3220
; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo
3241
3221
; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0
3242
3222
; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5
3243
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3223
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3244
3224
; GFX1250-NEXT: s_cmp_ge_u32 s0, s3
3245
- ; GFX1250-NEXT: s_wait_alu 0xfffd
3246
3225
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
3247
3226
; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
3248
3227
; GFX1250-NEXT: s_cmp_eq_u32 s0, s3
3249
3228
; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
3250
3229
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
3251
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3230
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3252
3231
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0
3253
- ; GFX1250-NEXT: s_wait_alu 0xfffd
3254
3232
; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo
3255
3233
; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo
3256
- ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3257
3234
; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3258
- ; GFX1250-NEXT: s_wait_alu 0xfffd
3235
+ ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
3259
3236
; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo
3260
3237
; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo
3261
3238
; GFX1250-NEXT: s_cbranch_execnz .LBB16_3
@@ -3269,31 +3246,25 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
3269
3246
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3270
3247
; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0
3271
3248
; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
3272
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3273
3249
; GFX1250-NEXT: s_mul_i32 s1, s1, s0
3274
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3250
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3275
3251
; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1
3276
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3277
3252
; GFX1250-NEXT: s_add_co_i32 s0, s0, s1
3278
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3253
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3279
3254
; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0
3280
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3281
3255
; GFX1250-NEXT: s_mul_i32 s1, s0, s2
3282
3256
; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
3283
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3284
3257
; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1
3285
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3258
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3286
3259
; GFX1250-NEXT: s_sub_co_i32 s4, s1, s2
3287
3260
; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
3288
3261
; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
3289
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3290
3262
; GFX1250-NEXT: s_cselect_b32 s1, s4, s1
3291
3263
; GFX1250-NEXT: s_add_co_i32 s3, s0, 1
3292
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3293
3264
; GFX1250-NEXT: s_cmp_ge_u32 s1, s2
3294
3265
; GFX1250-NEXT: s_mov_b32 s1, 0
3295
3266
; GFX1250-NEXT: s_cselect_b32 s0, s3, s0
3296
- ; GFX1250-NEXT: s_wait_alu 0xfffe
3267
+ ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3297
3268
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3298
3269
; GFX1250-NEXT: .LBB16_3:
3299
3270
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
0 commit comments