Skip to content

Commit cc31fbd

Browse files
committed
release/21.x: [AArch64,TTI] Disable RealUse check for vector insert/extract costs and Apple CPUs. (#146526)
Back-port #146526 (02d3738) for the 21.x release, just for Apple CPUs. As discussed during the review, the patch was landed just after the branch, to avoid regressions. We already did a careful performance analysis on Apple M series CPUs with this change and are seeing significant gains on a number of workloads, which we would like to enable for 21.x. Original message: getVectorInstrCostHelper would return costs of zero for vector inserts/extracts that move data between GPR and vector registers, if there was no 'real' use, i.e. there was no corresponding existing instruction. This meant that passes like LoopVectorize and SLPVectorizer, which likely are the main users of the interface, would underestimate the cost of inserts/extracts that move data between GPR and vector registers, which have non-trivial costs. The patch removes the special case and only returns costs of zero for lane 0 if there is no need to transfer between integer and vector registers. This impacts a number of SLP tests, and most of them look like general improvements. I think the change should make things more accurate for any AArch64 target, but if not it could also just be Apple CPU specific. I am seeing +2% end-to-end improvements on SLP-heavy workloads. PR: #146526
1 parent 440f762 commit cc31fbd

File tree

3 files changed

+44
-25
lines changed

3 files changed

+44
-25
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3894,22 +3894,44 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
38943894
: ST->getVectorInsertExtractBaseCost();
38953895
}
38963896

3897+
/// Returns true if \p ProcFamily is Apple M1-M4 or any of the aligned A series
3898+
/// CPUs.
3899+
static bool isAppleMCoreLike(unsigned ProcFamily) {
3900+
switch (ProcFamily) {
3901+
case AArch64Subtarget::AppleA14:
3902+
case AArch64Subtarget::AppleA15:
3903+
case AArch64Subtarget::AppleA16:
3904+
case AArch64Subtarget::AppleM4:
3905+
return true;
3906+
default:
3907+
return false;
3908+
};
3909+
}
3910+
38973911
InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
38983912
TTI::TargetCostKind CostKind,
38993913
unsigned Index,
39003914
const Value *Op0,
39013915
const Value *Op1) const {
3916+
39023917
bool HasRealUse =
39033918
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3919+
if (isAppleMCoreLike(ST->getProcFamily())) {
3920+
if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
3921+
isa<PoisonValue>(Op0))
3922+
return 0;
3923+
HasRealUse = true;
3924+
}
39043925
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
39053926
}
39063927

39073928
InstructionCost AArch64TTIImpl::getVectorInstrCost(
39083929
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
39093930
Value *Scalar,
39103931
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3911-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3912-
Scalar, ScalarUserAndIdx);
3932+
bool HasRealUse = isAppleMCoreLike(ST->getProcFamily());
3933+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse,
3934+
nullptr, Scalar, ScalarUserAndIdx);
39133935
}
39143936

39153937
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,

llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -60,24 +60,21 @@ define i16 @foo(i16 %in1, i16 %in2) {
6060
; APPLE-M1-LABEL: define i16 @foo(
6161
; APPLE-M1-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) #[[ATTR0:[0-9]+]] {
6262
; APPLE-M1-NEXT: entry:
63-
; APPLE-M1-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
64-
; APPLE-M1-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
65-
; APPLE-M1-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
66-
; APPLE-M1-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
67-
; APPLE-M1-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
68-
; APPLE-M1-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
69-
; APPLE-M1-NEXT: [[TMP6:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP2]]
70-
; APPLE-M1-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
71-
; APPLE-M1-NEXT: [[TMP8:%.*]] = icmp ne <2 x i64> [[TMP7]], splat (i64 65533)
72-
; APPLE-M1-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
63+
; APPLE-M1-NEXT: [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64
64+
; APPLE-M1-NEXT: [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64
65+
; APPLE-M1-NEXT: [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]]
66+
; APPLE-M1-NEXT: [[AND1:%.*]] = and i64 [[TMP10]], 65535
67+
; APPLE-M1-NEXT: [[TMP9:%.*]] = icmp ne i64 [[AND1]], 65533
7368
; APPLE-M1-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP9]] to i16
74-
; APPLE-M1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
7569
; APPLE-M1-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605
7670
; APPLE-M1-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16
7771
; APPLE-M1-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]]
78-
; APPLE-M1-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
72+
; APPLE-M1-NEXT: [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64
73+
; APPLE-M1-NEXT: [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64
74+
; APPLE-M1-NEXT: [[TMP12:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]]
75+
; APPLE-M1-NEXT: [[AND2:%.*]] = and i64 [[TMP12]], 65535
76+
; APPLE-M1-NEXT: [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533
7977
; APPLE-M1-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16
80-
; APPLE-M1-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
8178
; APPLE-M1-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP12]], 196605
8279
; APPLE-M1-NEXT: [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16
8380
; APPLE-M1-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]]

llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck --check-prefixes=CHECK,LIMIT-DEFAULT %s
3-
; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -vector-combine-max-scan-instrs=2 -S %s | FileCheck --check-prefixes=CHECK,LIMIT2 %s
2+
; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=CHECK,LIMIT-DEFAULT %s
3+
; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -mcpu=apple-m1 -vector-combine-max-scan-instrs=2 -S %s | FileCheck --check-prefixes=CHECK,LIMIT2 %s
44

55
define i32 @load_extract_idx_0(ptr %x) {
66
; CHECK-LABEL: @load_extract_idx_0(
@@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) {
669669
; Scalarizing the load for multiple constant indices may not be profitable.
670670
define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
671671
; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
672-
; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
673-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
674-
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
675-
; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
672+
; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
673+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1
674+
; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4
675+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
676676
; CHECK-NEXT: ret i32 [[RES]]
677677
;
678678
%lv = load <4 x i32>, ptr %x
@@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
686686
; because the large vector requires 2 vector registers.
687687
define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
688688
; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
689-
; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
690-
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
691-
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]]
692-
; CHECK-NEXT: [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
689+
; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
690+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6
691+
; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
692+
; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
693693
; CHECK-NEXT: ret i32 [[RES]]
694694
;
695695
%lv = load <8 x i32>, ptr %x, align 16

0 commit comments

Comments
 (0)