-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[DAGCombiner] infer wrap flags for trunc, use to fold itofp #148729
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[DAGCombiner] infer wrap flags for trunc, use to fold itofp #148729
Conversation
@llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-llvm-selectiondag Author: Alex MacLean (AlexMaclean) Changes: This change adds 2 related optimizations to DAGCombiner:
Full diff: https://github.com/llvm/llvm-project/pull/148729.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 231184587d682..6da4d036f7167 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16310,6 +16310,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
break;
}
+ // Use known bits to apply the nsw/nuw flags to the truncate.
+ const unsigned DestWidth = VT.getScalarSizeInBits();
+ const unsigned SrcWidth = N0.getScalarValueSizeInBits();
+ SDNodeFlags Flags = N->getFlags();
+ if (!N->getFlags().hasNoSignedWrap() &&
+ DAG.ComputeMaxSignificantBits(N0) <= DestWidth)
+ Flags.setNoSignedWrap(true);
+ if (!N->getFlags().hasNoUnsignedWrap() &&
+ DAG.MaskedValueIsZero(N0, APInt::getBitsSetFrom(SrcWidth, DestWidth)))
+ Flags.setNoUnsignedWrap(true);
+
+ if (!(Flags == N->getFlags())) {
+ N->setFlags(Flags);
+ return SDValue(N, 0);
+ }
+
return SDValue();
}
@@ -18713,6 +18729,12 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI))
return FTrunc;
+ // fold (sint_to_fp (trunc nsw x)) -> (sint_to_fp x)
+ if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoSignedWrap() &&
+ TLI.isTypeDesirableForOp(ISD::SINT_TO_FP,
+ N0.getOperand(0).getValueType()))
+ return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0.getOperand(0));
+
return SDValue();
}
@@ -18750,6 +18772,12 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI))
return FTrunc;
+ // fold (uint_to_fp (trunc nuw x)) -> (uint_to_fp x)
+ if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoUnsignedWrap() &&
+ TLI.isTypeDesirableForOp(ISD::UINT_TO_FP,
+ N0.getOperand(0).getValueType()))
+ return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0.getOperand(0));
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 410c0019c7222..ac8509605a8f4 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1425,4 +1425,25 @@ entry:
ret void
}
+define <4 x float> @test_uitofp_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: test_uitofp_v4i8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_v4i8_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r5, %r4;
+; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r7, %r6;
+; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7770U;
+; CHECK-NEXT: cvt.rn.f32.u32 %r9, %r8;
+; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3};
+; CHECK-NEXT: ret;
+ %f = uitofp <4 x i8> %a to <4 x float>
+ ret <4 x float> %f
+}
+
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
new file mode 100644
index 0000000000000..14942753f4acd
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_80 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define float @uitofp_trunc_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: uitofp_trunc_nuw(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nuw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nuw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nuw i32 %v to i16
+ %f = uitofp i16 %t to float
+ ret float %f
+}
+
+define float @sitofp_trunc_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: sitofp_trunc_nsw(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nsw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nsw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nsw i32 %v to i16
+ %f = sitofp i16 %t to float
+ ret float %f
+}
+
+;; Cannot safely fold here because the sign of the comparison does not match the
+;; sign of the wrap flag.
+define float @uitofp_trunc_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: uitofp_trunc_nsw(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nsw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nsw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: cvt.rn.f32.u16 %r4, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nsw i32 %v to i16
+ %f = uitofp i16 %t to float
+ ret float %f
+}
+
+;; Cannot safely fold here because the sign of the comparison does not match the
+;; sign of the wrap flag.
+define float @sitofp_trunc_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: sitofp_trunc_nuw(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nuw_param_0];
+; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nuw_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: cvt.rn.f32.s16 %r4, %rs1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+ %v = add i32 %x, %y
+ %t = trunc nuw i32 %v to i16
+ %f = sitofp i16 %t to float
+ ret float %f
+}
|
|
    if (!(Flags == N->getFlags())) {
      N->setFlags(Flags);
      return SDValue(N, 0);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to add users of N to the worklist?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM once Craig's question is resolved.
@@ -18713,6 +18729,12 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI))
     return FTrunc;

+  // fold (sint_to_fp (trunc nsw x)) -> (sint_to_fp x)
+  if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoSignedWrap() &&
+      TLI.isTypeDesirableForOp(ISD::SINT_TO_FP,
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes me wonder which conversion variants we actually have h/w support for on the GPU.
PTX spec effectively claims that (nearly) anything to anything is supported, but I don't know whether we should let LLVM do some of the job, or if we can always let ptxas do it.
It's not an issue for this patch, but a general thought that we may want to provide a custom TLI.isTypeDesirableForOp
for NVPTX. AFAICT we're currently running with the default which accepts all legal types.
This change adds 2 related optimizations to DAGCombiner: