Skip to content

Commit 6947bbf

Browse files
committed
[CodeGen] Lower vector interleaves of const splats to a wider splat
When lowering vector.interleave, we should check whether every operand is the same constant splat and, if so, lower directly to a wider splat. This avoids having to create a VECTOR_INTERLEAVE node, which produces a struct of values that we then have to concatenate. We could also do this with a DAG combine that looks for concat_vectors(vector_interleave(splat, splat, ...)), but that seemed like a lot of unnecessary extra complexity. While here, I fixed up the induction variable names to meet the coding standard.
1 parent 6d0ee9b commit 6947bbf

File tree

4 files changed

+83
-151
lines changed

4 files changed

+83
-151
lines changed

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12594,28 +12594,40 @@ void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I,
1259412594
setValue(&I, Res);
1259512595
}
1259612596

12597-
void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I,
12597+
void SelectionDAGBuilder::visitVectorInterleave(const CallInst &CI,
1259812598
unsigned Factor) {
1259912599
auto DL = getCurSDLoc();
1260012600
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12601-
EVT InVT = getValue(I.getOperand(0)).getValueType();
12602-
EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
12601+
EVT InVT = getValue(CI.getOperand(0)).getValueType();
12602+
EVT OutVT = TLI.getValueType(DAG.getDataLayout(), CI.getType());
1260312603

1260412604
SmallVector<SDValue, 8> InVecs(Factor);
12605-
bool OperandsAreSame = false;
12606-
for (unsigned i = 0; i < Factor; ++i) {
12607-
InVecs[i] = getValue(I.getOperand(i));
12608-
assert(InVecs[i].getValueType() == InVecs[0].getValueType() &&
12605+
bool OperandsAreSame = true;
12606+
for (unsigned I = 0; I < Factor; ++I) {
12607+
InVecs[I] = getValue(CI.getOperand(I));
12608+
assert(InVecs[I].getValueType() == InVecs[0].getValueType() &&
1260912609
"Expected VTs to be the same");
12610+
if (InVecs[I] != InVecs[0])
12611+
OperandsAreSame = false;
12612+
}
12613+
12614+
if (OperandsAreSame) {
12615+
if (auto *C = dyn_cast<Constant>(CI.getOperand(0))) {
12616+
if (auto *SV = C->getSplatValue()) {
12617+
SDValue Res = DAG.getNode(ISD::SPLAT_VECTOR, DL, OutVT, getValue(SV));
12618+
setValue(&CI, Res);
12619+
return;
12620+
}
12621+
}
1261012622
}
1261112623

1261212624
// Use VECTOR_SHUFFLE for fixed-length vectors with factor of 2 to benefit
1261312625
// from existing legalisation and combines.
1261412626
if (OutVT.isFixedLengthVector() && Factor == 2) {
1261512627
unsigned NumElts = InVT.getVectorMinNumElements();
1261612628
SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVecs);
12617-
setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT),
12618-
createInterleaveMask(NumElts, 2)));
12629+
setValue(&CI, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT),
12630+
createInterleaveMask(NumElts, 2)));
1261912631
return;
1262012632
}
1262112633

@@ -12624,11 +12636,11 @@ void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I,
1262412636
DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, DAG.getVTList(ValueVTs), InVecs);
1262512637

1262612638
SmallVector<SDValue, 8> Results(Factor);
12627-
for (unsigned i = 0; i < Factor; ++i)
12628-
Results[i] = Res.getValue(i);
12639+
for (unsigned I = 0; I < Factor; ++I)
12640+
Results[I] = Res.getValue(I);
1262912641

1263012642
Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Results);
12631-
setValue(&I, Res);
12643+
setValue(&CI, Res);
1263212644
}
1263312645

1263412646
void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,19 @@ target triple = "aarch64"
1414
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
1515
; CHECK-LABEL: complex_mul_v2f64:
1616
; CHECK: // %bb.0: // %entry
17+
; CHECK-NEXT: movi v0.2d, #0000000000000000
1718
; CHECK-NEXT: movi v1.2d, #0000000000000000
1819
; CHECK-NEXT: mov w8, #100 // =0x64
19-
; CHECK-NEXT: cntd x9
2020
; CHECK-NEXT: whilelo p1.d, xzr, x8
21+
; CHECK-NEXT: cntd x9
2122
; CHECK-NEXT: rdvl x10, #2
22-
; CHECK-NEXT: mov x11, x9
2323
; CHECK-NEXT: ptrue p0.d
24-
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
25-
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
24+
; CHECK-NEXT: mov x11, x9
2625
; CHECK-NEXT: .LBB0_1: // %vector.body
2726
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
2827
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
29-
; CHECK-NEXT: mov z6.d, z1.d
30-
; CHECK-NEXT: mov z7.d, z0.d
28+
; CHECK-NEXT: mov z6.d, z0.d
29+
; CHECK-NEXT: mov z7.d, z1.d
3130
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
3231
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
3332
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
@@ -39,14 +38,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
3938
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
4039
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
4140
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
42-
; CHECK-NEXT: mov z0.d, p2/m, z7.d
43-
; CHECK-NEXT: mov z1.d, p1/m, z6.d
41+
; CHECK-NEXT: mov z1.d, p2/m, z7.d
42+
; CHECK-NEXT: mov z0.d, p1/m, z6.d
4443
; CHECK-NEXT: whilelo p1.d, x11, x8
4544
; CHECK-NEXT: add x11, x11, x9
4645
; CHECK-NEXT: b.mi .LBB0_1
4746
; CHECK-NEXT: // %bb.2: // %exit.block
48-
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
49-
; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d
47+
; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
48+
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
5049
; CHECK-NEXT: faddv d0, p0, z2.d
5150
; CHECK-NEXT: faddv d1, p0, z1.d
5251
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -111,21 +110,20 @@ exit.block: ; preds = %vector.body
111110
define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %cond) {
112111
; CHECK-LABEL: complex_mul_predicated_v2f64:
113112
; CHECK: // %bb.0: // %entry
113+
; CHECK-NEXT: movi v0.2d, #0000000000000000
114114
; CHECK-NEXT: movi v1.2d, #0000000000000000
115115
; CHECK-NEXT: cntd x9
116-
; CHECK-NEXT: mov w11, #100 // =0x64
117116
; CHECK-NEXT: neg x10, x9
117+
; CHECK-NEXT: mov w11, #100 // =0x64
118118
; CHECK-NEXT: ptrue p0.d
119119
; CHECK-NEXT: mov x8, xzr
120120
; CHECK-NEXT: and x10, x10, x11
121121
; CHECK-NEXT: rdvl x11, #2
122-
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
123-
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
124122
; CHECK-NEXT: .LBB1_1: // %vector.body
125123
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
126124
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2]
127-
; CHECK-NEXT: mov z6.d, z1.d
128-
; CHECK-NEXT: mov z7.d, z0.d
125+
; CHECK-NEXT: mov z6.d, z0.d
126+
; CHECK-NEXT: mov z7.d, z1.d
129127
; CHECK-NEXT: add x8, x8, x9
130128
; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0
131129
; CHECK-NEXT: cmp x10, x8
@@ -141,12 +139,12 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
141139
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
142140
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
143141
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
144-
; CHECK-NEXT: mov z0.d, p2/m, z7.d
145-
; CHECK-NEXT: mov z1.d, p1/m, z6.d
142+
; CHECK-NEXT: mov z1.d, p2/m, z7.d
143+
; CHECK-NEXT: mov z0.d, p1/m, z6.d
146144
; CHECK-NEXT: b.ne .LBB1_1
147145
; CHECK-NEXT: // %bb.2: // %exit.block
148-
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
149-
; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d
146+
; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
147+
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
150148
; CHECK-NEXT: faddv d0, p0, z2.d
151149
; CHECK-NEXT: faddv d1, p0, z1.d
152150
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -213,21 +211,20 @@ exit.block: ; preds = %vector.body
213211
define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) {
214212
; CHECK-LABEL: complex_mul_predicated_x2_v2f64:
215213
; CHECK: // %bb.0: // %entry
214+
; CHECK-NEXT: movi v0.2d, #0000000000000000
216215
; CHECK-NEXT: movi v1.2d, #0000000000000000
217216
; CHECK-NEXT: mov w8, #100 // =0x64
218-
; CHECK-NEXT: cntd x9
219217
; CHECK-NEXT: whilelo p1.d, xzr, x8
218+
; CHECK-NEXT: cntd x9
220219
; CHECK-NEXT: rdvl x10, #2
221-
; CHECK-NEXT: cnth x11
222220
; CHECK-NEXT: ptrue p0.d
221+
; CHECK-NEXT: cnth x11
223222
; CHECK-NEXT: mov x12, x9
224-
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
225-
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
226223
; CHECK-NEXT: .LBB2_1: // %vector.body
227224
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
228225
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
229-
; CHECK-NEXT: mov z6.d, z1.d
230-
; CHECK-NEXT: mov z7.d, z0.d
226+
; CHECK-NEXT: mov z6.d, z0.d
227+
; CHECK-NEXT: mov z7.d, z1.d
231228
; CHECK-NEXT: add x2, x2, x11
232229
; CHECK-NEXT: and z2.d, z2.d, #0xffffffff
233230
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
@@ -243,14 +240,14 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
243240
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
244241
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
245242
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
246-
; CHECK-NEXT: mov z0.d, p2/m, z7.d
247-
; CHECK-NEXT: mov z1.d, p1/m, z6.d
243+
; CHECK-NEXT: mov z1.d, p2/m, z7.d
244+
; CHECK-NEXT: mov z0.d, p1/m, z6.d
248245
; CHECK-NEXT: whilelo p1.d, x12, x8
249246
; CHECK-NEXT: add x12, x12, x9
250247
; CHECK-NEXT: b.mi .LBB2_1
251248
; CHECK-NEXT: // %bb.2: // %exit.block
252-
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
253-
; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d
249+
; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
250+
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
254251
; CHECK-NEXT: faddv d0, p0, z2.d
255252
; CHECK-NEXT: faddv d1, p0, z1.d
256253
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,14 @@ target triple = "aarch64"
1414
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
1515
; CHECK-LABEL: complex_mul_v2f64:
1616
; CHECK: // %bb.0: // %entry
17+
; CHECK-NEXT: movi v0.2d, #0000000000000000
1718
; CHECK-NEXT: movi v1.2d, #0000000000000000
1819
; CHECK-NEXT: cntd x8
19-
; CHECK-NEXT: mov w10, #100 // =0x64
2020
; CHECK-NEXT: neg x9, x8
21+
; CHECK-NEXT: mov w10, #100 // =0x64
2122
; CHECK-NEXT: ptrue p0.d
2223
; CHECK-NEXT: and x9, x9, x10
2324
; CHECK-NEXT: rdvl x10, #2
24-
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
25-
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
2625
; CHECK-NEXT: .LBB0_1: // %vector.body
2726
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
2827
; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
@@ -32,14 +31,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
3231
; CHECK-NEXT: ldr z5, [x1]
3332
; CHECK-NEXT: add x1, x1, x10
3433
; CHECK-NEXT: add x0, x0, x10
35-
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
36-
; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
37-
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
38-
; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
34+
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
35+
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
36+
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
37+
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
3938
; CHECK-NEXT: b.ne .LBB0_1
4039
; CHECK-NEXT: // %bb.2: // %exit.block
41-
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
42-
; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d
40+
; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d
41+
; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d
4342
; CHECK-NEXT: faddv d0, p0, z2.d
4443
; CHECK-NEXT: faddv d1, p0, z1.d
4544
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
@@ -183,17 +182,16 @@ exit.block: ; preds = %vector.body
183182
define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
184183
; CHECK-LABEL: complex_mul_v2f64_unrolled:
185184
; CHECK: // %bb.0: // %entry
185+
; CHECK-NEXT: movi v0.2d, #0000000000000000
186186
; CHECK-NEXT: movi v1.2d, #0000000000000000
187187
; CHECK-NEXT: cntw x8
188-
; CHECK-NEXT: mov w10, #1000 // =0x3e8
188+
; CHECK-NEXT: movi v2.2d, #0000000000000000
189+
; CHECK-NEXT: movi v3.2d, #0000000000000000
189190
; CHECK-NEXT: neg x9, x8
191+
; CHECK-NEXT: mov w10, #1000 // =0x3e8
190192
; CHECK-NEXT: ptrue p0.d
191193
; CHECK-NEXT: and x9, x9, x10
192194
; CHECK-NEXT: rdvl x10, #4
193-
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
194-
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
195-
; CHECK-NEXT: mov z2.d, z1.d
196-
; CHECK-NEXT: mov z3.d, z0.d
197195
; CHECK-NEXT: .LBB2_1: // %vector.body
198196
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
199197
; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
@@ -207,20 +205,20 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
207205
; CHECK-NEXT: ldr z18, [x1, #3, mul vl]
208206
; CHECK-NEXT: ldr z19, [x1, #2, mul vl]
209207
; CHECK-NEXT: add x1, x1, x10
210-
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
211-
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
208+
; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
209+
; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
212210
; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0
213211
; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0
214-
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90
215-
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90
212+
; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90
213+
; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90
216214
; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90
217215
; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90
218216
; CHECK-NEXT: b.ne .LBB2_1
219217
; CHECK-NEXT: // %bb.2: // %exit.block
220218
; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
221-
; CHECK-NEXT: uzp1 z5.d, z1.d, z0.d
219+
; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d
222220
; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
223-
; CHECK-NEXT: uzp2 z0.d, z1.d, z0.d
221+
; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
224222
; CHECK-NEXT: fadd z1.d, z4.d, z5.d
225223
; CHECK-NEXT: fadd z2.d, z2.d, z0.d
226224
; CHECK-NEXT: faddv d0, p0, z1.d
@@ -310,15 +308,15 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
310308
; CHECK-LABEL: reduction_mix:
311309
; CHECK: // %bb.0: // %entry
312310
; CHECK-NEXT: movi v2.2d, #0000000000000000
311+
; CHECK-NEXT: movi v0.2d, #0000000000000000
313312
; CHECK-NEXT: cntd x9
314-
; CHECK-NEXT: mov w11, #100 // =0x64
313+
; CHECK-NEXT: movi v1.2d, #0000000000000000
315314
; CHECK-NEXT: neg x10, x9
315+
; CHECK-NEXT: mov w11, #100 // =0x64
316316
; CHECK-NEXT: ptrue p0.d
317317
; CHECK-NEXT: mov x8, xzr
318318
; CHECK-NEXT: and x10, x10, x11
319319
; CHECK-NEXT: rdvl x11, #2
320-
; CHECK-NEXT: zip2 z0.d, z2.d, z2.d
321-
; CHECK-NEXT: zip1 z1.d, z2.d, z2.d
322320
; CHECK-NEXT: .LBB3_1: // %vector.body
323321
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
324322
; CHECK-NEXT: ldr z3, [x0]
@@ -327,13 +325,13 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
327325
; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
328326
; CHECK-NEXT: add x8, x8, x9
329327
; CHECK-NEXT: cmp x10, x8
330-
; CHECK-NEXT: fadd z0.d, z4.d, z0.d
331-
; CHECK-NEXT: fadd z1.d, z3.d, z1.d
328+
; CHECK-NEXT: fadd z1.d, z4.d, z1.d
329+
; CHECK-NEXT: fadd z0.d, z3.d, z0.d
332330
; CHECK-NEXT: add z2.d, z5.d, z2.d
333331
; CHECK-NEXT: b.ne .LBB3_1
334332
; CHECK-NEXT: // %bb.2: // %middle.block
335-
; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d
336-
; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d
333+
; CHECK-NEXT: uzp2 z3.d, z0.d, z1.d
334+
; CHECK-NEXT: uzp1 z1.d, z0.d, z1.d
337335
; CHECK-NEXT: uaddv d2, p0, z2.d
338336
; CHECK-NEXT: faddv d0, p0, z3.d
339337
; CHECK-NEXT: faddv d1, p0, z1.d

0 commit comments

Comments
 (0)