diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b79203c0b3..6e715a0160419 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -321,6 +321,10 @@ class ConstantOffsetExtractor {
   bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                     bool NonNegative);
 
+  /// Analyze an XOR instruction to extract disjoint constant bits that behave
+  /// like addition operations for improved address mode folding.
+  APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
+
   /// The path from the constant offset to the old GEP index. e.g., if the GEP
   /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
   /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -633,6 +637,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
     // Trace into subexpressions for more hoisting opportunities.
     if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
       ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+    // Handle XOR with disjoint bits that can be treated as addition.
+    else if (BO->getOpcode() == Instruction::Xor)
+      ConstantOffset = extractDisjointBitsFromXor(BO);
   } else if (isa<TruncInst>(V)) {
     ConstantOffset =
         find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -745,11 +752,19 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   Value *NextInChain = removeConstOffset(ChainIndex - 1);
   Value *TheOther = BO->getOperand(1 - OpNo);
 
-  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
-  // sub-expression to be just TheOther.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
-    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
-      return TheOther;
+    if (CI->isZero()) {
+      // Special handling for XOR with disjoint bits: keep the original XOR
+      // instruction operating on the non-disjoint part of the constant; the
+      // remaining operation is still meaningful.
+      if (BO->getOpcode() == Instruction::Xor)
+        return BO;
+
+      // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+      // sub-expression to be just TheOther.
+      if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+        return TheOther;
+    }
   }
 
   BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -780,6 +795,69 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   return NewBO;
 }
 
+/// Analyze an XOR instruction to extract disjoint constant bits for address
+/// folding.
+///
+/// This function identifies bits in an XOR constant operand that are known
+/// to be zero in the base operand. For these disjoint bits, XOR behaves
+/// identically to addition, allowing us to extract them as constant offsets
+/// that can be folded into addressing modes.
+///
+/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
+/// DisjointBits` where DisjointBits = Const & KnownZeros(Base).
+///
+/// Example with %ptr having a known-zero low bit:
+///   Original: `xor %ptr, 3`    ; 3 = 0b11
+///   Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
+///   Result:   `(xor %ptr, 2) + 1` where 1 can be folded into the address mode
+///
+/// \param XorInst The XOR binary operator to analyze
+/// \return APInt containing the disjoint bits that can be extracted as an
+///         offset, or zero if no disjoint bits exist
+APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
+    BinaryOperator *XorInst) {
+  assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
+         "Expected XOR instruction");
+
+  const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
+  Value *BaseOperand;
+  ConstantInt *XorConstant;
+
+  // Match pattern: xor BaseOperand, Constant.
+  if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
+    return APInt::getZero(BitWidth);
+
+  // Try to extract a constant offset from the base operand recursively.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
+    APInt ConstantOffset = find(BO, /*SignExtended=*/false,
+                                /*ZeroExtended=*/false, /*NonNegative=*/false);
+    if (!ConstantOffset.isZero())
+      return ConstantOffset;
+  }
+
+  // Compute known bits for the base operand.
+  const SimplifyQuery SQ(DL);
+  const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
+  const APInt &ConstantValue = XorConstant->getValue();
+
+  // Identify disjoint bits: constant bits that are known zero in the base.
+  const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
+
+  // Early exit if no disjoint bits are found.
+  if (DisjointBits.isZero())
+    return APInt::getZero(BitWidth);
+
+  // Compute the remaining non-disjoint bits that stay in the XOR.
+  const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
+
+  // Add the non-disjoint constant to the user chain for later transformation.
+  // This will replace the original constant in the XOR with the reduced
+  // constant.
+  UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits));
+
+  return DisjointBits;
+}
+
 /// A helper function to check if reassociating through an entry in the user
 /// chain would invalidate the GEP's nuw flag.
 static bool allowsPreservingNUW(const User *U) {
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
new file mode 100644
index 0000000000000..36900ba7e16bd
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -0,0 +1,429 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; Test that an xor with a constant operand is decomposed into a gep.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN:     -S < %s | FileCheck %s
+; Test that the gvn pass eliminates the redundant xor instructions from the decomposition.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \ +; RUN: -S < %s | FileCheck --check-prefix=GVN %s + +; Check that disjoint constants are properly extracted and folded into GEP +; addressing modes and GVN to eliminate redundant computations +define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test1( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192 +; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384 +; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576 +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16 +; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]] +; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test1( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384 +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]] +; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]] +; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 4128 + %5 = xor i32 %2, 8224 + %6 = xor i32 %2, 12320 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %8 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %9 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %10 = 
getelementptr half, ptr addrspace(3) %1, i32 %6 + %11 = load <8 x half>, ptr addrspace(3) %7, align 16 + %12 = load <8 x half>, ptr addrspace(3) %8, align 16 + %13 = load <8 x half>, ptr addrspace(3) %9, align 16 + %14 = load <8 x half>, ptr addrspace(3) %10, align 16 + %15 = fadd <8 x half> %11, %12 + %16 = fadd <8 x half> %13, %14 + %17 = fadd <8 x half> %15, %16 + store <8 x half> %17, ptr addrspace(3) %1, align 16 + ret void +} + +; Check that disjoint constants are properly extracted and folded into GEP +; addressing modes and GVN to eliminate redundant computations +define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test2( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576 +; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384 +; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16 +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16 +; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]] +; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test2( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384 +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]] +; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]] +; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select 
i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 12320 + %4 = xor i32 %2, 8224 + %5 = xor i32 %2, 4128 + %6 = xor i32 %2, 32 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %8 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %9 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %10 = getelementptr half, ptr addrspace(3) %1, i32 %6 + %11 = load <8 x half>, ptr addrspace(3) %7, align 16 + %12 = load <8 x half>, ptr addrspace(3) %8, align 16 + %13 = load <8 x half>, ptr addrspace(3) %9, align 16 + %14 = load <8 x half>, ptr addrspace(3) %10, align 16 + %15 = fadd <8 x half> %11, %12 + %16 = fadd <8 x half> %13, %14 + %17 = fadd <8 x half> %15, %16 + store <8 x half> %17, ptr addrspace(3) %1, align 16 + ret void +} + +; Verify that xor instructions with different non-disjoint constants are optimized +define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test3( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096 +; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]] +; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test3( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096 +; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]] +; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]] +; GVN-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 2336 + %5 = xor i32 %2, 4128 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %8 = getelementptr half, ptr 
addrspace(3) %1, i32 %5 + %9 = load <8 x half>, ptr addrspace(3) %6, align 16 + %10 = load <8 x half>, ptr addrspace(3) %7, align 16 + %11 = load <8 x half>, ptr addrspace(3) %8, align 16 + %12 = fadd <8 x half> %9, %10 + %13 = fadd <8 x half> %11, %12 + store <8 x half> %13, ptr addrspace(3) %1, align 16 + ret void +} + +; Verify that no optimization occurs when disjoint constants are absent +define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test4( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]] +; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test4( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288 +; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]] +; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 288 + %5 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %7 = load <8 x half>, ptr addrspace(3) %5, align 16 + %8 = load <8 x half>, ptr addrspace(3) %6, align 16 + %9 = fadd <8 x half> %7, %8 + store <8 x half> %9, ptr addrspace(3) %1, align 16 + ret void +} + + +; Verify that XOR-BinOp-GEP usage chains are properly optimized +define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test5( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 256 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]] +; CHECK-NEXT: store <8 x half> [[TMP11]], ptr addrspace(3) 
[[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test5( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], 256 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 4128 + %5 = add i32 %4, 256 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %8 = load <8 x half>, ptr addrspace(3) %6, align 16 + %9 = load <8 x half>, ptr addrspace(3) %7, align 16 + %10 = fadd <8 x half> %8, %9 + store <8 x half> %10, ptr addrspace(3) %1, align 16 + ret void +} + +; Verify that BinOp-XOR-GEP usage chains are properly optimized +define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test6( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 4128 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test6( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 4128 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = add i32 %2, 256 + %5 = xor i32 %4, 4128 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %8 = load <8 x half>, ptr 
addrspace(3) %6, align 16 + %9 = load <8 x half>, ptr addrspace(3) %7, align 16 + %10 = fadd <8 x half> %8, %9 + store <8 x half> %10, ptr addrspace(3) %1, align 16 + ret void +} + +; Verify that BinOp-XOR-GEP usage chains with non disjoint xor works as +; intended. +define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test6a( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test6a( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = add i32 %2, 256 + %5 = xor i32 %4, 288 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %8 = load <8 x half>, ptr addrspace(3) %6, align 16 + %9 = load <8 x half>, ptr addrspace(3) %7, align 16 + %10 = fadd <8 x half> %8, %9 + store <8 x half> %10, ptr addrspace(3) %1, align 16 + ret void +} + +; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are +; not extracted +define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test7( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]] +; CHECK-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; 
+; GVN-LABEL: define amdgpu_kernel void @test7( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32800 +; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; GVN-NEXT: [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]] +; GVN-NEXT: store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 32800 + %5 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %7 = load <8 x half>, ptr addrspace(3) %5, align 16 + %8 = load <8 x half>, ptr addrspace(3) %6, align 16 + %9 = fadd <8 x half> %7, %8 + store <8 x half> %9, ptr addrspace(3) %1, align 16 + ret void +} +
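
The patch relies on the identity that, on bits where the base value is known to be zero, xor behaves exactly like add. Below is a minimal standalone C++ sketch (an editorial illustration, not part of the patch; names are made up) that checks this identity for the values used in test1: the index base is `select i1 %0, i32 0, i32 288`, so every bit outside 288 is known zero, and the constants 32, 4128, 8224 and 12320 all split into a shared non-disjoint part (32) plus disjoint offsets 0, 4096, 8192 and 12288.

#include <cassert>
#include <cstdio>

int main() {
  // Mirrors test1: the base is `select i1 %0, i32 0, i32 288`, so every bit
  // outside 288 (0b1'0010'0000) is known to be zero in the base.
  const unsigned KnownZeroMask = ~288u;
  const unsigned Bases[] = {0u, 288u};
  const unsigned Consts[] = {32u, 4128u, 8224u, 12320u};
  for (unsigned Base : Bases) {
    for (unsigned Const : Consts) {
      const unsigned Disjoint = Const & KnownZeroMask; // extracted GEP offset
      const unsigned NonDisjoint = Const & ~Disjoint;  // stays in the xor
      // On the disjoint bits the base contributes nothing, so xor and add
      // coincide and the decomposition is exact.
      assert((Base ^ Const) == (Base ^ NonDisjoint) + Disjoint);
      std::printf("base=%4u const=%5u -> (base ^ %2u) + %5u\n", Base, Const,
                  NonDisjoint, Disjoint);
    }
  }
  return 0;
}

Scaled by the 2-byte `half` element type of the GEPs, the disjoint index offsets 4096, 8192 and 12288 correspond to the byte offsets 8192, 16384 and 24576 that the CHECK lines in test1 show being folded into `getelementptr i8` offsets.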