[ARM] Have custom lowering for ucmp and scmp #149315
Open
AZero13 wants to merge 4 commits into llvm:main from AZero13:scmparm
+943 −215
Conversation
Limited to non-Thumb at the moment, but for i32 we can do this in three instructions, using subs to set the flags first.
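For reference, this is the shape of the lowering for a plain i32 three-way compare, written in the same FileCheck style as the updated tests. The CHECK lines are taken from the scmp_32_32 and ucmp_32_32 cases in this diff; the RUN line is only illustrative and assumes an ARM-mode (non-Thumb) armv7 triple, which is not spelled out in the excerpt below.

; Illustrative RUN line; the triple is an assumption, not copied from the PR's test files:
; RUN: llc -mtriple=armv7-none-eabi < %s | FileCheck %s

define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: scmp_32_32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    subs r0, r0, r1
; CHECK-NEXT:    movwgt r0, #1
; CHECK-NEXT:    mvnlt r0, #0
; CHECK-NEXT:    bx lr
  %1 = call i32 @llvm.scmp(i32 %x, i32 %y)
  ret i32 %1
}

define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp_32_32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    subs r0, r0, r1
; CHECK-NEXT:    movwhi r0, #1
; CHECK-NEXT:    mvnlo r0, #0
; CHECK-NEXT:    bx lr
  %1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
  ret i32 %1
}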
@llvm/pr-subscribers-backend-arm
Author: AZero13 (AZero13)
Changes: Limited to non-Thumb at the moment, but we can do this for i32 in 3 steps, using subs to set the flags initially.
Full diff: https://github.com/llvm/llvm-project/pull/149315.diff
4 Files Affected:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 65d1c4e2d6515..9681eab17518b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -802,6 +802,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
setOperationAction(ISD::BSWAP, VT, Expand);
}
+ if (!Subtarget->isThumb()) {
+ setOperationAction(ISD::SCMP, MVT::i32, Custom);
+ setOperationAction(ISD::UCMP, MVT::i32, Custom);
+ }
+
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
@@ -10614,6 +10619,142 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
return DAG.getBitcast(MVT::i32, Res);
}
+SDValue ARMTargetLowering::LowerSCMP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ // For the ARM assembly pattern:
+ // subs r0, r0, r1 ; subtract RHS from LHS and set flags
+ // movgt r0, #1 ; if LHS > RHS, set result to 1
+ // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (mvn #0 = -1)
+ // ; if LHS == RHS, result remains 0 from the subs
+
+ // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
+ // Check if RHS is (0 - something), and if so use ADDC with LHS + something
+ SDValue SubResult, Flags;
+ bool CanUseAdd = false;
+ SDValue AddOperand;
+
+ // Check if RHS is a subtraction against 0: (0 - X)
+ if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubLHS = RHS.getOperand(0);
+ SDValue SubRHS = RHS.getOperand(1);
+
+ // Check if it's 0 - X
+ if (isNullConstant(SubLHS)) {
+ // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
+ if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
+ .getSignedMinValue()
+ .isMinSignedValue()) {
+ CanUseAdd = true;
+ AddOperand = SubRHS; // Replace RHS with X, so we do LHS + X instead of
+ // LHS - (0 - X)
+ }
+ }
+ }
+
+ if (CanUseAdd) {
+ // Use ADDC: LHS + AddOperand (where RHS was 0 - AddOperand)
+ SDValue AddWithFlags = DAG.getNode(
+ ARMISD::ADDC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, AddOperand);
+ SubResult = AddWithFlags.getValue(0); // The addition result
+ Flags = AddWithFlags.getValue(1); // The flags from ADDS
+ } else {
+ // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
+ SDValue SubWithFlags = DAG.getNode(
+ ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ SubResult = SubWithFlags.getValue(0); // The subtraction result
+ Flags = SubWithFlags.getValue(1); // The flags from SUBS
+ }
+
+ // Constants for conditional moves
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue MinusOne = DAG.getConstant(0xFFFFFFFF, dl, MVT::i32);
+
+ // movgt: if greater than, set to 1
+ SDValue GTCond = DAG.getConstant(ARMCC::GT, dl, MVT::i32);
+ SDValue Result1 =
+ DAG.getNode(ARMISD::CMOV, dl, MVT::i32, SubResult, One, GTCond, Flags);
+
+ // mvnlt: if less than, set to -1 (equivalent to mvn #0)
+ SDValue LTCond = DAG.getConstant(ARMCC::LT, dl, MVT::i32);
+ SDValue Result2 =
+ DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, LTCond, Flags);
+
+ if (Op.getValueType() != MVT::i32)
+ Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
+
+ return Result2;
+}
+
+SDValue ARMTargetLowering::LowerUCMP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
+ // For the ARM assembly pattern (unsigned version):
+ // subs r0, r0, r1 ; subtract RHS from LHS and set flags
+ // movhi r0, #1 ; if LHS > RHS (unsigned), set result to 1
+ // mvnlo r0, #0 ; if LHS < RHS (unsigned), set result to -1
+ // ; if LHS == RHS, result remains 0 from the subs
+
+ // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
+ // Check if RHS is (0 - something), and if so use ADDC with LHS + something
+ SDValue SubResult, Flags;
+ bool CanUseAdd = false;
+ SDValue AddOperand;
+
+ // Check if RHS is a subtraction against 0: (0 - X)
+ if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubLHS = RHS.getOperand(0);
+ SDValue SubRHS = RHS.getOperand(1);
+
+ // Check if it's 0 - X
+ if (isNullConstant(SubLHS)) {
+ // For UCMP: only if X is known to never be zero
+ if (DAG.isKnownNeverZero(SubRHS)) {
+ CanUseAdd = true;
+ AddOperand = SubRHS; // Replace RHS with X, so we do LHS + X instead of
+ // LHS - (0 - X)
+ }
+ }
+ }
+
+ if (CanUseAdd) {
+ // Use ADDC: LHS + AddOperand (where RHS was 0 - AddOperand)
+ SDValue AddWithFlags = DAG.getNode(
+ ARMISD::ADDC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, AddOperand);
+ SubResult = AddWithFlags.getValue(0); // The addition result
+ Flags = AddWithFlags.getValue(1); // The flags from ADDS
+ } else {
+ // Use ARMISD::SUBC to generate SUBS instruction (subtract with flags)
+ SDValue SubWithFlags = DAG.getNode(
+ ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
+ SubResult = SubWithFlags.getValue(0); // The subtraction result
+ Flags = SubWithFlags.getValue(1); // The flags from SUBS
+ }
+
+ // Constants for conditional moves
+ SDValue One = DAG.getConstant(1, dl, MVT::i32);
+ SDValue MinusOne = DAG.getConstant(0xFFFFFFFF, dl, MVT::i32);
+
+ // movhi: if higher (unsigned greater than), set to 1
+ SDValue HICond = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
+ SDValue Result1 =
+ DAG.getNode(ARMISD::CMOV, dl, MVT::i32, SubResult, One, HICond, Flags);
+
+ // mvnlo: if lower (unsigned less than), set to -1
+ SDValue LOCond = DAG.getConstant(ARMCC::LO, dl, MVT::i32);
+ SDValue Result2 =
+ DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne, LOCond, Flags);
+
+ if (Op.getValueType() != MVT::i32)
+ Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
+
+ return Result2;
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -10742,6 +10883,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_BF16:
return LowerFP_TO_BF16(Op, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
+ case ISD::SCMP:
+ return LowerSCMP(Op, DAG);
+ case ISD::UCMP:
+ return LowerUCMP(Op, DAG);
}
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef55b22c9..1cb7edd041b32 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -903,6 +903,8 @@ class VectorType;
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSCMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUCMP(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
diff --git a/llvm/test/CodeGen/ARM/scmp.ll b/llvm/test/CodeGen/ARM/scmp.ll
index 6e493c993751c..9189aee6aaf43 100644
--- a/llvm/test/CodeGen/ARM/scmp.ll
+++ b/llvm/test/CodeGen/ARM/scmp.ll
@@ -4,12 +4,9 @@
define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
; CHECK-LABEL: scmp_8_8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlt r0, #1
-; CHECK-NEXT: movwgt r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwgt r0, #1
+; CHECK-NEXT: mvnlt r0, #0
; CHECK-NEXT: bx lr
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
; CHECK-LABEL: scmp_8_16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlt r0, #1
-; CHECK-NEXT: movwgt r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwgt r0, #1
+; CHECK-NEXT: mvnlt r0, #0
; CHECK-NEXT: bx lr
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: scmp_8_32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlt r0, #1
-; CHECK-NEXT: movwgt r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwgt r0, #1
+; CHECK-NEXT: mvnlt r0, #0
; CHECK-NEXT: bx lr
%1 = call i8 @llvm.scmp(i32 %x, i32 %y)
ret i8 %1
@@ -92,17 +83,26 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: scmp_32_32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlt r0, #1
-; CHECK-NEXT: movwgt r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwgt r0, #1
+; CHECK-NEXT: mvnlt r0, #0
; CHECK-NEXT: bx lr
%1 = call i32 @llvm.scmp(i32 %x, i32 %y)
ret i32 %1
}
+define i32 @scmp_neg(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: scmp_neg:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: movwgt r0, #1
+; CHECK-NEXT: mvnlt r0, #0
+; CHECK-NEXT: bx lr
+ %yy = sub nsw i32 0, %y
+ %1 = call i32 @llvm.scmp(i32 %x, i32 %yy)
+ ret i32 %1
+}
+
define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
; CHECK-LABEL: scmp_32_64:
; CHECK: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/ucmp.ll b/llvm/test/CodeGen/ARM/ucmp.ll
index ad4af534ee8fe..bb0201454d1ea 100644
--- a/llvm/test/CodeGen/ARM/ucmp.ll
+++ b/llvm/test/CodeGen/ARM/ucmp.ll
@@ -4,12 +4,9 @@
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
; CHECK-LABEL: ucmp_8_8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlo r0, #1
-; CHECK-NEXT: movwhi r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwhi r0, #1
+; CHECK-NEXT: mvnlo r0, #0
; CHECK-NEXT: bx lr
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
; CHECK-LABEL: ucmp_8_16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlo r0, #1
-; CHECK-NEXT: movwhi r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwhi r0, #1
+; CHECK-NEXT: mvnlo r0, #0
; CHECK-NEXT: bx lr
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp_8_32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlo r0, #1
-; CHECK-NEXT: movwhi r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwhi r0, #1
+; CHECK-NEXT: mvnlo r0, #0
; CHECK-NEXT: bx lr
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
ret i8 %1
@@ -92,12 +83,9 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ucmp_32_32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlo r0, #1
-; CHECK-NEXT: movwhi r2, #1
-; CHECK-NEXT: sub r0, r2, r0
+; CHECK-NEXT: subs r0, r0, r1
+; CHECK-NEXT: movwhi r0, #1
+; CHECK-NEXT: mvnlo r0, #0
; CHECK-NEXT: bx lr
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
ret i32 %1
}
There is no really good way to do scmp, sadly.
I derived the assembly by hand. Honestly, the implementation came to me in a dream last night...
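For anyone who wants to double-check the hand-derived sequence locally, a minimal repro is just to run llc over a small scmp function. This is a sketch: the file name, function name, and triple are assumptions (ARM mode, matching what this patch targets), not something taken from the PR.

; cmp3.ll -- minimal repro sketch; assumed ARM-mode armv7 triple
define i32 @cmp3(i32 %x, i32 %y) nounwind {
  %r = call i32 @llvm.scmp(i32 %x, i32 %y)
  ret i32 %r
}
; Inspect the output with:
;   llc -mtriple=armv7-none-eabi cmp3.ll -o -
; With this patch applied, the body should be the subs / movwgt / mvnlt
; sequence from the updated scmp.ll test; without it, you get the older
; cmp + two conditional movw + sub expansion shown in the removed CHECK lines.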
davemgreen reviewed Jul 20, 2025
Because this is what we are doing: it is here for the sake of cost calculations, even though this is manually lowered via asm.
topperc reviewed Jul 23, 2025
Limited to non-Thumb1 for scmp at the moment, since there is no good way to do it there.