[AArch64] Enable preferZeroCompareBranch for AArch64 when we don't have fused cmp+br #150045
Conversation
@davemgreen Thoughts?
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-systemz
Author: AZero13 (AZero13)
Changes: Obviously we also cannot do this if speculative load hardening is on too.
Full diff: https://github.com/llvm/llvm-project/pull/150045.diff
8 Files Affected:
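For reference, the rewrite that this hook gates in CodeGenPrepare turns an unsigned bound check into a shift plus a compare against zero, roughly as follows (a simplified sketch based on the comment in optimizeBranch; the pass handles more shapes than this one pattern):

  ; before
  %c = icmp ult i32 %x, 8
  br i1 %c, label %below, label %other

  ; after, when preferZeroCompareBranch returns true
  %sh = lshr i32 %x, 3
  %c = icmp eq i32 %sh, 0
  br i1 %c, label %below, label %other

Without the CMPBR subtarget feature this lowers to an lsr followed by cbz, as the new test below shows; with the Armv9.6 CB compare-and-branch instructions the compare and branch can already be fused, so the patch keeps the original form there.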
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 084b788d51828..993b75256fa23 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -749,7 +749,7 @@ class LLVM_ABI TargetLoweringBase {
/// Return true if the heuristic to prefer icmp eq zero should be used in code
/// gen prepare.
- virtual bool preferZeroCompareBranch() const { return false; }
+ virtual bool preferZeroCompareBranch(BranchInst *) const { return false; }
/// Return true if it is cheaper to split the store of a merged int val
/// from a pair of smaller values into multiple stores.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d9d41f1d72e35..90ad9949fb772 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8630,7 +8630,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
// br %c, bla, blb
// Creating the cmp to zero can be better for the backend, especially if the
// lshr produces flags that can be used automatically.
- if (!TLI.preferZeroCompareBranch() || !Branch->isConditional())
+ if (!TLI.preferZeroCompareBranch(Branch) || !Branch->isConditional())
return false;
ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff23f76fadccd..12a7bf60401e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28508,6 +28508,14 @@ Register AArch64TargetLowering::getExceptionSelectorRegister(
return AArch64::X1;
}
+bool AArch64TargetLowering::preferZeroCompareBranch(BranchInst *Branch) const {
+ // If we can use Armv9.6 CB instructions, prefer that over zero compare branches.
+
+ // If we have speculative load hardening enabled, we cannot use
+ // zero compare branches.
+ return !Subtarget->hasCMPBR() && !Branch->getFunction()->hasFnAttribute(Attribute::SpeculativeLoadHardening);
+}
+
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
// Only sink 'and' mask to cmp use block if it is masking a single bit, since
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 7b1de3d3254f2..26fa599655a48 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -366,6 +366,8 @@ class AArch64TargetLowering : public TargetLowering {
return true;
}
+ bool preferZeroCompareBranch(BranchInst *) const override;
+
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
bool hasAndNotCompare(SDValue V) const override {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef55b22c9..ff55dd8d1b06d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -605,7 +605,7 @@ class VectorType;
Sched::Preference getSchedulingPreference(SDNode *N) const override;
- bool preferZeroCompareBranch() const override { return true; }
+ bool preferZeroCompareBranch(BranchInst *) const override { return true; }
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a5d735c407e5c..71dd861b17759 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -225,7 +225,7 @@ class RISCVTargetLowering : public TargetLowering {
unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override;
- bool preferZeroCompareBranch() const override { return true; }
+ bool preferZeroCompareBranch(BranchInst *) const override { return true; }
// Note that one specific case requires fence insertion for an
// AtomicCmpXchgInst but is handled via the RISCVZacasABIFix pass rather
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 1866962e17587..b5e497d773bd9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -471,7 +471,7 @@ class SystemZTargetLowering : public TargetLowering {
}
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
- bool preferZeroCompareBranch() const override { return true; }
+ bool preferZeroCompareBranch(BranchInst *) const override { return true; }
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override {
ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
return Mask && Mask->getValue().isIntN(16);
diff --git a/llvm/test/CodeGen/AArch64/branch-on-zero.ll b/llvm/test/CodeGen/AArch64/branch-on-zero.ll
new file mode 100644
index 0000000000000..efd4d2b319c55
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/branch-on-zero.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i32 @test_lshr(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-SD-LABEL: test_lshr:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr w8, w2, #2
+; CHECK-SD-NEXT: cbz w8, .LBB0_2
+; CHECK-SD-NEXT: .LBB0_1: // %while.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr w9, [x1], #4
+; CHECK-SD-NEXT: subs w8, w8, #1
+; CHECK-SD-NEXT: lsl w9, w9, #1
+; CHECK-SD-NEXT: str w9, [x0], #4
+; CHECK-SD-NEXT: b.ne .LBB0_1
+; CHECK-SD-NEXT: .LBB0_2: // %while.end
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_lshr:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr w8, w2, #2
+; CHECK-GI-NEXT: cbz w8, .LBB0_2
+; CHECK-GI-NEXT: .LBB0_1: // %while.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w9, [x1], #4
+; CHECK-GI-NEXT: add x10, x0, #4
+; CHECK-GI-NEXT: subs w8, w8, #1
+; CHECK-GI-NEXT: lsl w9, w9, #1
+; CHECK-GI-NEXT: str w9, [x0]
+; CHECK-GI-NEXT: mov x0, x10
+; CHECK-GI-NEXT: b.ne .LBB0_1
+; CHECK-GI-NEXT: .LBB0_2: // %while.end
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: ret
+entry:
+ %shr = lshr i32 %n, 2
+ %tobool.not4 = icmp eq i32 %shr, 0
+ br i1 %tobool.not4, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %c.07 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+ %x.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %x, %entry ]
+ %y.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %y, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, ptr %y.addr.05, i32 1
+ %0 = load i32, ptr %y.addr.05, align 4
+ %mul = shl nsw i32 %0, 1
+ %incdec.ptr1 = getelementptr inbounds i32, ptr %x.addr.06, i32 1
+ store i32 %mul, ptr %x.addr.06, align 4
+ %dec = add nsw i32 %c.07, -1
+ %tobool.not = icmp eq i32 %dec, 0
+ br i1 %tobool.not, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret i32 0
+}
+
+define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-SD-LABEL: test_lshr2:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: lsr w8, w2, #2
+; CHECK-SD-NEXT: cbz w8, .LBB1_2
+; CHECK-SD-NEXT: .LBB1_1: // %while.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr w9, [x1], #4
+; CHECK-SD-NEXT: subs w8, w8, #1
+; CHECK-SD-NEXT: lsl w9, w9, #1
+; CHECK-SD-NEXT: str w9, [x0], #4
+; CHECK-SD-NEXT: b.ne .LBB1_1
+; CHECK-SD-NEXT: .LBB1_2: // %while.end
+; CHECK-SD-NEXT: mov w0, wzr
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_lshr2:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: lsr w8, w2, #2
+; CHECK-GI-NEXT: cbz w8, .LBB1_2
+; CHECK-GI-NEXT: .LBB1_1: // %while.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w9, [x1], #4
+; CHECK-GI-NEXT: add x10, x0, #4
+; CHECK-GI-NEXT: subs w8, w8, #1
+; CHECK-GI-NEXT: lsl w9, w9, #1
+; CHECK-GI-NEXT: str w9, [x0]
+; CHECK-GI-NEXT: mov x0, x10
+; CHECK-GI-NEXT: b.ne .LBB1_1
+; CHECK-GI-NEXT: .LBB1_2: // %while.end
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: ret
+entry:
+ %tobool.not4 = icmp ult i32 %n, 4
+ br i1 %tobool.not4, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ %shr = lshr i32 %n, 2
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %c.07 = phi i32 [ %dec, %while.body ], [ %shr, %while.body.preheader ]
+ %x.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %x, %while.body.preheader ]
+ %y.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %y, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, ptr %y.addr.05, i32 1
+ %0 = load i32, ptr %y.addr.05, align 4
+ %mul = shl nsw i32 %0, 1
+ %incdec.ptr1 = getelementptr inbounds i32, ptr %x.addr.06, i32 1
+ store i32 %mul, ptr %x.addr.06, align 4
+ %dec = add nsw i32 %c.07, -1
+ %tobool.not = icmp eq i32 %dec, 0
+ br i1 %tobool.not, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret i32 0
+}
+
+
+define i32 @lshr(i32 %u) {
+; CHECK-LABEL: lshr:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: mov w19, w0
+; CHECK-NEXT: lsr w0, w0, #4
+; CHECK-NEXT: mov w8, w19
+; CHECK-NEXT: cbz w0, .LBB2_2
+; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: bl use
+; CHECK-NEXT: add w8, w19, w19, lsl #1
+; CHECK-NEXT: .LBB2_2: // %if.end
+; CHECK-NEXT: sub w9, w19, #7
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %cmp.not = icmp ult i32 %u, 16
+ br i1 %cmp.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %shr = lshr i32 %u, 4
+ tail call void @use(i32 noundef %shr)
+ %mul = mul i32 %u, 3
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %u.addr.0 = phi i32 [ %mul, %if.then ], [ %u, %entry ]
+ %sub = add i32 %u, -7
+ %cmp1 = icmp ugt i32 %u.addr.0, %sub
+ %conv = zext i1 %cmp1 to i32
+ ret i32 %conv
+}
+
+declare void @use(i32)
+
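Purely as an illustrative sketch (the function below is hypothetical and not part of the patch's test file), a function carrying the speculative_load_hardening attribute should keep its original compare, since the new AArch64 hook returns false for it:

; Hypothetical SLH variant of the @lshr test above (not in this PR).
define i32 @lshr_slh(i32 %u) speculative_load_hardening {
entry:
  %cmp.not = icmp ult i32 %u, 16
  br i1 %cmp.not, label %if.end, label %if.then

if.then:                                          ; preds = %entry
  %shr = lshr i32 %u, 4
  tail call void @use(i32 noundef %shr)
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret i32 0
}

declare void @use(i32)

With the hook returning false here, CodeGenPrepare should leave the icmp ult in place, so no lsr/cbz pair is formed for this function.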
@llvm/pr-subscribers-backend-risc-v Author: AZero13 (AZero13) (same changes and full diff as above)
@llvm/pr-subscribers-backend-aarch64 Author: AZero13 (AZero13) (same changes and full diff as above)
@davemgreen These tests are copy-pastes of the ones you made for ARM.
[AArch64] Enable preferZeroCompareBranch for AArch64 when we don't have fused cmp+br
We also cannot do this if speculative load hardening is on too.