Commit 286e1e8

[AMDGPU][GlobalISel] Combine for breaking s64 and/or into two s32 insts
When either operand is all ones in its high or low 32 bits, splitting the 64-bit operation into two 32-bit operations opens up further combine opportunities: one of the two new instructions will either be removed or become a simple copy.
1 parent 33a8f9f

12 files changed: +137 -118 lines
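To make the transformation concrete, here is a minimal value-level sketch in plain C++ (illustrative only; the function name is made up for this note and is not part of the patch). It computes a 64-bit AND as two independent 32-bit ANDs on the low and high halves and re-merges them, mirroring the G_UNMERGE_VALUES / G_MERGE_VALUES sequence the combine emits; when one half of the mask is all ones or all zeros, the corresponding 32-bit op later folds to a copy or a constant.

#include <cstdint>
#include <cstdio>

// Illustrative sketch only (not LLVM code): compute a 64-bit AND as two
// 32-bit ANDs on the lo/hi halves and re-merge the halves.
static uint64_t and64AsTwo32(uint64_t X, uint64_t Y) {
  uint32_t Lo = static_cast<uint32_t>(X) & static_cast<uint32_t>(Y);
  uint32_t Hi = static_cast<uint32_t>(X >> 32) & static_cast<uint32_t>(Y >> 32);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t X = 0x0123456789ABCDEFULL;
  uint64_t HiMask = 0xFFFFFFFF00000000ULL; // hi half all ones, lo half all zeros
  // The hi-half AND is a plain copy of X's hi half; the lo-half AND folds to 0.
  std::printf("0x%016llX\n",
              static_cast<unsigned long long>(and64AsTwo32(X, HiMask)));
  return 0;
}

Compiled with any C++ compiler this prints 0x0123456700000000, the same result as the original 64-bit AND, but expressed in a form where later combines can delete one of the two halves outright.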

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 23 additions & 2 deletions
@@ -151,6 +151,25 @@ def zext_of_shift_amount_combines : GICombineGroup<[
   canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
 ]>;
 
+// (and/or i64:x, i64:y) -> i64:(merge (and/or lo_32(x), lo_32(y)), (and/or hi_32(x), hi_32(y)))
+// when either x or y is all ones in low or high parts
+class combine_binop_s64_with_s32_mask<Instruction opcode> : GICombineRule<
+  (defs root:$dst),
+  (match (opcode $dst, i64:$x, i64:$y):$dst,
+    [{ return Helper.matchConstantIs32BitMask(${x}.getReg()) ||
+              Helper.matchConstantIs32BitMask(${y}.getReg()); }]),
+  (apply (G_UNMERGE_VALUES i32:$x_lo, i32:$x_hi, $x),
+         (G_UNMERGE_VALUES i32:$y_lo, i32:$y_hi, $y),
+         (opcode i32:$lo, $x_lo, $y_lo),
+         (opcode i32:$hi, $x_hi, $y_hi),
+         (G_MERGE_VALUES $dst, $lo, $hi))>;
+
+def combine_or_s64_with_s32_mask : combine_binop_s64_with_s32_mask<G_OR>;
+def combine_and_s64_with_s32_mask : combine_binop_s64_with_s32_mask<G_AND>;
+def binop_s64_with_s32_mask_combines : GICombineGroup<[
+  combine_or_s64_with_s32_mask, combine_and_s64_with_s32_mask
+]>;
+
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
 // saves one instruction compared to the promotion.
@@ -180,15 +199,17 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
 def AMDGPUPreLegalizerCombiner: GICombiner<
   "AMDGPUPreLegalizerCombinerImpl",
   [all_combines, combine_fmul_with_select_to_fldexp, clamp_i64_to_i16,
-   foldable_fneg, combine_shuffle_vector_to_build_vector]> {
+   foldable_fneg, combine_shuffle_vector_to_build_vector,
+   binop_s64_with_s32_mask_combines]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
 def AMDGPUPostLegalizerCombiner: GICombiner<
   "AMDGPUPostLegalizerCombinerImpl",
   [all_combines, gfx6gfx7_combines, gfx8_combines, combine_fmul_with_select_to_fldexp,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-   rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> {
+   rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64,
+   binop_s64_with_s32_mask_combines]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }

llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp

Lines changed: 7 additions & 0 deletions
@@ -9,6 +9,7 @@
 #include "AMDGPUCombinerHelper.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -516,3 +517,9 @@ bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
 
   return true;
 }
+
+bool AMDGPUCombinerHelper::matchConstantIs32BitMask(Register Reg) const {
+  const KnownBits &Known = VT->getKnownBits(Reg);
+  return Known.One.extractBits(32, 0).isAllOnes() ||
+         Known.One.extractBits(32, 32).isAllOnes();
+}
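For reference, when the register is a fully known constant, Known.One is simply the constant value, so the check above amounts to asking whether either 32-bit half of the 64-bit value is all ones. A rough standalone restatement in plain C++ (illustrative only, not the LLVM API):

#include <cstdint>

// Illustrative restatement for a fully known 64-bit constant: true when
// either the low or the high 32-bit half is all ones, e.g.
// 0xFFFFFFFF00000000 and 0x00000000FFFFFFFF qualify; 0x0000000123456789 does not.
static bool isHalf32AllOnes(uint64_t C) {
  return static_cast<uint32_t>(C) == 0xFFFFFFFFu ||
         static_cast<uint32_t>(C >> 32) == 0xFFFFFFFFu;
}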

llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h

Lines changed: 2 additions & 0 deletions
@@ -43,6 +43,8 @@ class AMDGPUCombinerHelper : public CombinerHelper {
   bool matchCombineFmulWithSelectToFldexp(
       MachineInstr &MI, MachineInstr &Sel,
       std::function<void(MachineIRBuilder &)> &MatchInfo) const;
+
+  bool matchConstantIs32BitMask(Register Reg) const;
 };
 
 } // namespace llvm

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-binop-s64-with-s32-mask.mir

Lines changed: 37 additions & 39 deletions
@@ -11,9 +11,10 @@ body: |
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
-; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[AND]](s64)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 -4294967296
@@ -31,9 +32,10 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
-; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[AND]](s64)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 -4294967296
@@ -52,9 +54,9 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[AND]](s64)
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 4294967295
@@ -72,9 +74,9 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[AND]](s64)
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[ZEXT]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 4294967295
@@ -91,14 +93,10 @@
 ; CHECK-LABEL: name: test_and_mask_hi_with_merge_unmerge
 ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
-; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
-; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
-; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV]], [[C]]
-; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
-; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
-; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
+; CHECK-NEXT: $sgpr1 = COPY [[COPY]](s32)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
 %0:_(s32) = COPY $sgpr0
 %1:_(s32) = COPY $sgpr1
@@ -140,9 +138,10 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
-; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[OR]](s64)
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[C]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 -4294967296
@@ -160,9 +159,10 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
-; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[OR]](s64)
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[TRUNC]](s32), [[C]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 -4294967296
@@ -181,9 +181,10 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[OR]](s64)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 4294967295
@@ -201,9 +202,10 @@
 ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $sgpr0_sgpr1
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
-; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[C]]
-; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[OR]](s64)
+; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV1]](s32)
+; CHECK-NEXT: $sgpr0_sgpr1 = COPY [[MV]](s64)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0_sgpr1
 %0:_(s64) = COPY $sgpr0_sgpr1
 %1:_(s64) = G_CONSTANT i64 4294967295
@@ -221,13 +223,9 @@
 ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
-; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
-; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
-; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[MV]], [[C]]
-; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[OR]](s64)
-; CHECK-NEXT: $sgpr0 = COPY [[UV]](s32)
-; CHECK-NEXT: $sgpr1 = COPY [[UV1]](s32)
+; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+; CHECK-NEXT: $sgpr0 = COPY [[COPY]](s32)
+; CHECK-NEXT: $sgpr1 = COPY [[C]](s32)
 ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
 %0:_(s32) = COPY $sgpr0
 %1:_(s32) = COPY $sgpr1

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 19 additions & 21 deletions
@@ -227,54 +227,52 @@ exit:
 define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 {
 ; GFX10-LABEL: single_lane_execution_attribute:
 ; GFX10: ; %bb.0: ; %.entry
-; GFX10-NEXT: s_mov_b32 s6, 0
-; GFX10-NEXT: s_getpc_b64 s[4:5]
-; GFX10-NEXT: s_mov_b32 s7, -1
-; GFX10-NEXT: s_mov_b32 s2, s1
-; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
-; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_getpc_b64 s[12:13]
+; GFX10-NEXT: s_mov_b32 s12, 0
+; GFX10-NEXT: s_mov_b32 s2, s0
+; GFX10-NEXT: s_mov_b32 s3, s12
 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0
-; GFX10-NEXT: s_or_b64 s[12:13], s[4:5], s[0:1]
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0
+; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: s_xor_b32 s3, vcc_lo, exec_lo
+; GFX10-NEXT: s_xor_b32 s2, vcc_lo, exec_lo
+; GFX10-NEXT: s_and_b32 vcc_lo, s2, exec_lo
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
-; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s3
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
 ; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT: s_mov_b32 s3, 0
+; GFX10-NEXT: s_mov_b32 s2, 0
 ; GFX10-NEXT: .LBB4_2: ; %.preheader
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v3, s12
 ; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
-; GFX10-NEXT: s_add_i32 s1, s1, 4
+; GFX10-NEXT: s_add_i32 s12, s12, 4
 ; GFX10-NEXT: buffer_load_dword v3, v3, s[4:7], 0 offen
 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s12, v3
-; GFX10-NEXT: s_add_i32 s3, s12, s3
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: s_add_i32 s2, s3, s2
 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
 ; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v2
-; GFX10-NEXT: s_or_b32 s1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX10-NEXT: s_branch .LBB4_6
 ; GFX10-NEXT: .LBB4_4:
-; GFX10-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-NEXT: s_mov_b32 s2, exec_lo
 ; GFX10-NEXT: ; implicit-def: $vgpr1
-; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s1
+; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT: s_cbranch_vccz .LBB4_6
 ; GFX10-NEXT: ; %bb.5: ; %.19
 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT: v_or_b32_e32 v1, 2, v1
 ; GFX10-NEXT: .LBB4_6: ; %.22
-; GFX10-NEXT: v_add_lshl_u32 v0, v0, s2, 2
+; GFX10-NEXT: v_add_lshl_u32 v0, v0, s1, 2
 ; GFX10-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
 ; GFX10-NEXT: s_endpgm
 .entry:

llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-zextload-from-and.mir

Lines changed: 3 additions & 3 deletions
@@ -14,9 +14,9 @@ body: |
 ; CHECK-NEXT: {{ $}}
 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
 ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1)
-; CHECK-NEXT: %k:_(s64) = G_CONSTANT i64 4294967295
-; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], %k
-; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[AND]](s64)
+; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LOAD]](s64)
+; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s32)
+; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
 %0:_(p1) = COPY $vgpr0_vgpr1
 %1:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1)
 %k:_(s64) = G_CONSTANT i64 4294967295

llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll

Lines changed: 4 additions & 6 deletions
@@ -194,10 +194,8 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-LABEL: s_sdiv_i64:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s6, 0
 ; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
-; CHECK-NEXT: s_mov_b32 s7, -1
-; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; CHECK-NEXT: s_mov_b32 s0, 0
 ; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
 ; CHECK-NEXT: s_mov_b32 s0, 1
 ; CHECK-NEXT: s_cbranch_vccz .LBB1_2
@@ -218,7 +216,6 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT: s_subb_u32 s5, 0, s11
-; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9]
 ; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1
@@ -327,9 +324,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
+; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9]
+; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0
+; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
 ; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
 ; CHECK-NEXT: s_branch .LBB1_3
 ; CHECK-NEXT: .LBB1_2:
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
