From 195c01cd9b58e9f1645121954baced14c6e6ae3a Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Wed, 30 Jul 2025 17:46:21 +0100
Subject: [PATCH 1/3] Add test showing unwanted sinking of vector compare.

---
 .../dont-sink-scalable-vector-compare.ll      | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll

diff --git a/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll b/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
new file mode 100644
index 0000000000000..e1441ef4a1466
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
+define void @do_not_sink_scalable_vector_compare(ptr %a, ptr %b) {
+; CHECK-LABEL: define void @do_not_sink_scalable_vector_compare(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[STEP_VECTOR:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <vscale x 4 x i32> [[STEP_VECTOR]], splat (i32 16)
+; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[SRC]], i32 4, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[DST:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_LOAD]], ptr [[DST]], i32 4, <vscale x 4 x i1> [[TMP0]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[VECTOR_END:.*]], label %[[VECTOR_BODY]]
+; CHECK: [[VECTOR_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  %step.vector = call <vscale x 4 x i32> @llvm.stepvector()
+  %mask = icmp ult <vscale x 4 x i32> %step.vector, splat (i32 16)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %src = getelementptr inbounds ptr, ptr %a, i64 %index
+  %wide.load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %src, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
+  %dst = getelementptr inbounds ptr, ptr %b, i64 %index
+  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %wide.load, ptr %dst, i32 4, <vscale x 4 x i1> %mask)
+  %index.next = add nuw i64 %index, 16
+  %exit.cond = icmp eq i64 %index.next, 1024
+  br i1 %exit.cond, label %vector.end, label %vector.body
+
+vector.end:
+  ret void
+}
+

From f7cd51c1b77b4495bec9915fc087557fdff1a90a Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Wed, 30 Jul 2025 13:57:14 +0100
Subject: [PATCH 2/3] [LLVM][CGP] Allow finer control for sinking compares.

Whether a compare is sunk is decided by the result of
hasMultipleConditionRegisters. This function is too coarse-grained
because it does not take into account the differences between scalar
and vector compares. This PR extends the interface to take an EVT to
allow finer control.

The new interface is used by AArch64 to disable sinking of scalable
vector compares, with isProfitableToSinkOperands updated to maintain
the cases that are specifically tested.
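
For illustration, this is the shape of the new hook as AArch64
implements it below (the signature matches the TargetLowering change
in this patch; the comment is added here purely for exposition):

  bool hasMultipleConditionRegisters(EVT VT) const override {
    // Keep sinking scalar compares as before, but leave scalable
    // vector compares where they are so CodeGenPrepare no longer
    // duplicates them into every user block.
    return VT.isScalableVector();
  }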
---
 llvm/include/llvm/CodeGen/TargetLowering.h    | 28 +++++--------------
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  2 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  1 -
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  4 +++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 17 +++++++----
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  8 ------
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   | 10 +++++++
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |  5 +++-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  2 ++
 .../dont-sink-scalable-vector-compare.ll      |  2 +-
 10 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cbdc1b6031680..52729e9e7cee7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -518,10 +518,12 @@ class LLVM_ABI TargetLoweringBase {
     return true;
   }
 
-  /// Return true if multiple condition registers are available.
-  bool hasMultipleConditionRegisters() const {
-    return HasMultipleConditionRegisters;
-  }
+  /// Does the target have multiple (allocatable) condition registers that
+  /// can be used to store the results of comparisons for use by selects
+  /// and conditional branches. With multiple condition registers, the code
+  /// generator will not aggressively sink comparisons into the blocks of
+  /// their users.
+  virtual bool hasMultipleConditionRegisters(EVT VT) const { return false; }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2453,7 +2455,7 @@ class LLVM_ABI TargetLoweringBase {
                                        EVT VT) const {
     // If a target has multiple condition registers, then it likely has logical
     // operations on those registers.
-    if (hasMultipleConditionRegisters())
+    if (hasMultipleConditionRegisters(VT))
       return false;
     // Only do the transform if the value won't be split into multiple
     // registers.
@@ -2560,15 +2562,6 @@ class LLVM_ABI TargetLoweringBase {
     StackPointerRegisterToSaveRestore = R;
   }
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
-    HasMultipleConditionRegisters = hasManyRegs;
-  }
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
@@ -3604,13 +3597,6 @@ class LLVM_ABI TargetLoweringBase {
 private:
   const TargetMachine &TM;
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  bool HasMultipleConditionRegisters;
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 416c56d5a36f8..7e0c45e92b365 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1834,7 +1834,7 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
 ///
 /// Return true if any changes are made.
 static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters())
+  if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType())))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3c91b0eb4e2ea..9b22033c7f4ac 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -697,7 +697,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
   MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
       MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
-  HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ea63edd86210e..88876570ac811 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -887,6 +887,10 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldScalarizeBinop(SDValue VecOp) const override {
     return VecOp.getOpcode() == ISD::SETCC;
   }
+
+  bool hasMultipleConditionRegisters(EVT VT) const override {
+    return VT.isScalableVector();
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 40f49dade6131..1afc777d2b3b5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6225,10 +6225,17 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
   }
 
-  auto ShouldSinkCondition = [](Value *Cond) -> bool {
+  auto ShouldSinkCondition = [](Value *Cond,
+                                SmallVectorImpl<Use *> &Ops) -> bool {
+    if (!isa<IntrinsicInst>(Cond))
+      return false;
     auto *II = dyn_cast<IntrinsicInst>(Cond);
-    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
-           isa<ScalableVectorType>(II->getOperand(0)->getType());
+    if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
+        !isa<ScalableVectorType>(II->getOperand(0)->getType()))
+      return false;
+    if (isa<CmpInst>(II->getOperand(0)))
+      Ops.push_back(&II->getOperandUse(0));
+    return true;
   };
 
   switch (I->getOpcode()) {
@@ -6244,7 +6251,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
     break;
   case Instruction::Select: {
-    if (!ShouldSinkCondition(I->getOperand(0)))
+    if (!ShouldSinkCondition(I->getOperand(0), Ops))
       return false;
 
     Ops.push_back(&I->getOperandUse(0));
@@ -6254,7 +6261,7 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     if (cast<BranchInst>(I)->isUnconditional())
       return false;
 
-    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition(), Ops))
       return false;
 
     Ops.push_back(&I->getOperandUse(0));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 61189337e5233..e32e7a2f92832 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -589,14 +589,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
 
-  // FIXME: This is only partially true. If we have to do vector compares, any
-  // SGPR pair can be a condition register. If we have a uniform condition, we
-  // are better off doing SALU operations, where there is only one SCC. For now,
-  // we don't have a way of knowing during instruction selection if a condition
-  // will be uniform and we always use vector compares. Assume we are using
-  // vector compares until that is fixed.
-  setHasMultipleConditionRegisters(true);
-
   setMinCmpXchgSizeInBits(32);
   setSupportsUnalignedAtomics(false);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 39bb0adfc1a17..fd5d5b8dec431 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -388,6 +388,16 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  bool hasMultipleConditionRegisters(EVT VT) const override {
+    // FIXME: This is only partially true. If we have to do vector compares,
+    // any SGPR pair can be a condition register. If we have a uniform
+    // condition, we are better off doing SALU operations, where there is
+    // only one SCC. For now, we don't have a way of knowing during
+    // instruction selection if a condition will be uniform and we always
+    // use vector compares. Assume we are using vector compares until that
+    // is fixed.
+    return true;
+  }
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 459525ed4ee9a..99c50dc7e145f 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1433,7 +1433,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
   if (Subtarget.useCRBits()) {
-    setHasMultipleConditionRegisters();
     setJumpIsExpensive();
   }
 
@@ -19848,3 +19847,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
   return Builder.CreateOr(
       Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
 }
+
+bool PPCTargetLowering::hasMultipleConditionRegisters(EVT VT) const {
+  return Subtarget.useCRBits();
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 124c7116dc3b5..9755f0e272d16 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1207,6 +1207,8 @@ namespace llvm {
                             bool IsVarArg) const;
     bool supportsTailCallFor(const CallBase *CB) const;
 
+    bool hasMultipleConditionRegisters(EVT VT) const override;
+
   private:
     struct ReuseLoadInfo {
       SDValue Ptr;
diff --git a/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll b/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
index e1441ef4a1466..5960eaa0464f9 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
@@ -6,10 +6,10 @@ define void @do_not_sink_scalable_vector_compare(ptr %a, ptr %b) {
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
 ; CHECK-NEXT: [[STEP_VECTOR:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <vscale x 4 x i32> [[STEP_VECTOR]], splat (i32 16)
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <vscale x 4 x i32> [[STEP_VECTOR]], splat (i32 16)
 ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[SRC]], i32 4, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT: [[DST:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 [[INDEX]]

From b10b6b0627f00a608482920d1660864d1f666e8d Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Fri, 1 Aug 2025 10:50:57 +0000
Subject: [PATCH 3/3] Update RUN line to use -passes=

---
 .../CodeGenPrepare/dont-sink-scalable-vector-compare.ll | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll b/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
index 5960eaa0464f9..26de31d6f7499 100644
--- a/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -codegenprepare -S < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S < %s | FileCheck %s
 
-define void @do_not_sink_scalable_vector_compare(ptr %a, ptr %b) {
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @do_not_sink_scalable_vector_compare(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: define void @do_not_sink_scalable_vector_compare(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*]]:
@@ -39,3 +41,4 @@ vector.end:
   ret void
 }
 
+attributes #0 = { "target-features"="+sve" }
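
A quick way to exercise the updated test locally (a sketch: it assumes
an in-tree build with the AArch64 target enabled, and <build> stands
for your build directory):

  <build>/bin/opt -passes='require<profile-summary>,function(codegenprepare)' \
    -S llvm/test/Transforms/CodeGenPrepare/dont-sink-scalable-vector-compare.ll

Without this series the icmp is sunk into vector.body; with it applied
the compare stays in the entry block, as the CHECK lines expect.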