From 72844323dc672f2d077f169c0f3856e8f2401d96 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni
Date: Wed, 23 Jul 2025 11:58:12 -0500
Subject: [PATCH 1/7] [SeparateConstOffsetFromGEP] Decompose constant xor operand if possible

Try to transform XOR(A, B+C) into XOR(A, C) + B, where XOR(A, C) becomes
part of the base of a memory operation. The transformation is valid under
the following conditions:
Check 1 - B and C are disjoint.
Check 2 - XOR(A, C) and B are disjoint.
For example, given %3 = xor i32 %2, 32 and %4 = xor i32 %2, 8224, %4 can be
rewritten as add i32 %3, 8192 once both checks hold for B = 8192 and C = 32.
This transformation can map these XORs into better addressing modes and
eventually decompose them into GEPs.
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 141 ++++++++++++++++--
 .../AMDGPU/xor-idiom.ll                       |  66 ++++++++
 2 files changed, 191 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b79203c0b3..203850c28787c 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -238,16 +238,17 @@ class ConstantOffsetExtractor {
   /// \p PreservesNUW Outputs whether the extraction allows preserving the
   /// GEP's nuw flag, if it has one.
   static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
-                        User *&UserChainTail, bool &PreservesNUW);
+                        User *&UserChainTail, bool &PreservesNUW,
+                        DominatorTree *DT);

   /// Looks for a constant offset from the given GEP index without extracting
   /// it. It returns the numeric value of the extracted constant offset (0 if
   /// failed). The meaning of the arguments are the same as Extract.
-  static int64_t Find(Value *Idx, GetElementPtrInst *GEP);
+  static int64_t Find(Value *Idx, GetElementPtrInst *GEP, DominatorTree *DT);

 private:
-  ConstantOffsetExtractor(BasicBlock::iterator InsertionPt)
-      : IP(InsertionPt), DL(InsertionPt->getDataLayout()) {}
+  ConstantOffsetExtractor(BasicBlock::iterator InsertionPt, DominatorTree *DT)
+      : IP(InsertionPt), DT(DT), DL(InsertionPt->getDataLayout()) {}

   /// Searches the expression that computes V for a non-zero constant C s.t.
   /// V can be reassociated into the form V' + C. If the searching is
@@ -321,6 +322,20 @@ class ConstantOffsetExtractor {
   bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
                     bool NonNegative);

+  // Find the most dominating Xor with the same base operand.
+  BinaryOperator *findDominatingXor(Value *BaseOperand,
+                                    BinaryOperator *CurrentXor);
+
+  /// Check if Xor instruction should be considered for optimization.
+  bool shouldConsiderXor(BinaryOperator *XorInst);
+
+  /// Cache the information about Xor idiom.
+  struct XorRewriteInfo {
+    llvm::BinaryOperator *BaseXor = nullptr;
+    int64_t AdjustedOffset = 0;
+  };
+  std::optional<XorRewriteInfo> CachedXorInfo;
+
   /// The path from the constant offset to the old GEP index. e.g., if the GEP
   /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
   /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
@@ -336,6 +351,8 @@
   /// Insertion position of cloned instructions.
   BasicBlock::iterator IP;
+  DominatorTree *DT;
+
   const DataLayout &DL;
 };

@@ -514,12 +531,14 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
                                            bool ZeroExtended,
                                            BinaryOperator *BO,
                                            bool NonNegative) {
-  // We only consider ADD, SUB and OR, because a non-zero constant found in
+  // We only consider ADD, SUB, OR and XOR, because a non-zero constant found in
   // expressions composed of these operations can be easily hoisted as a
-  // constant offset by reassociation.
+  // constant offset by reassociation. XOR is a special case and can be folded
+  // into the GEP if the constant is proven to be disjoint.
   if (BO->getOpcode() != Instruction::Add &&
       BO->getOpcode() != Instruction::Sub &&
-      BO->getOpcode() != Instruction::Or) {
+      BO->getOpcode() != Instruction::Or &&
+      BO->getOpcode() != Instruction::Xor) {
     return false;
   }

@@ -530,6 +549,10 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
       !cast<PossiblyDisjointInst>(BO)->isDisjoint())
     return false;

+  // Handle Xor idiom.
+  if (BO->getOpcode() == Instruction::Xor)
+    return shouldConsiderXor(BO);
+
   // FIXME: We don't currently support constants from the RHS of subs,
   // when we are zero-extended, because we need a way to zero-extended
   // them before they are negated.
@@ -740,6 +763,10 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
          "UserChain, so no one should be used more than "
          "once");

+  // Special case for Xor idiom.
+  if (BO->getOpcode() == Instruction::Xor)
+    return CachedXorInfo->BaseXor;
+
   unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
   assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
   Value *NextInChain = removeConstOffset(ChainIndex - 1);
@@ -780,6 +807,80 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   return NewBO;
 }

+// Find the most dominating Xor with the same base operand.
+BinaryOperator *
+ConstantOffsetExtractor::findDominatingXor(Value *BaseOperand,
+                                           BinaryOperator *CurrentXor) {
+  BinaryOperator *MostDominatingXor = nullptr;
+  // Iterate over all instructions that use the BaseOperand.
+  for (User *U : BaseOperand->users()) {
+    auto *CandidateXor = dyn_cast<BinaryOperator>(U);
+
+    // Simple checks.
+    if (!CandidateXor || CandidateXor == CurrentXor)
+      continue;
+
+    // Check if the binary operator is a Xor with a constant.
+    if (!match(CandidateXor, m_Xor(m_Specific(BaseOperand), m_ConstantInt())))
+      continue;
+
+    // After confirming the structure, check the dominance relationship.
+    if (DT->dominates(CandidateXor, CurrentXor))
+      // If we find a dominating Xor, keep it if it's the first one,
+      // or if it dominates the best candidate we've found so far.
+      if (!MostDominatingXor || DT->dominates(CandidateXor, MostDominatingXor))
+        MostDominatingXor = CandidateXor;
+  }
+
+  return MostDominatingXor;
+}
+
+// Check if Xor should be considered.
+// Only the following idiom is considered.
+// Example:
+//   %3 = xor i32 %2, 32
+//   %4 = xor i32 %2, 8224
+//   %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+//   %7 = getelementptr half, ptr addrspace(3) %1, i32 %4
+// The GEP that corresponds to %7 looks at the binary operator %4.
+// In order for %4 to be considered, it should have a dominating xor whose
+// constant offset is disjoint with the adjusted offset.
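+// Concretely, for the example above: AdjustedOffset = 8224 - 32 = 8192;
+// 8192 & 32 == 0 (the constants are disjoint), and 8192 must also be known
+// zero in %3 (the dominating xor and the adjusted offset are disjoint).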
+// If disjoint, %4 = xor i32 %2, 8224 can be treated as %4 = add i32 %3, 8192 +bool ConstantOffsetExtractor::shouldConsiderXor(BinaryOperator *XorInst) { + + Value *BaseOperand = nullptr; + ConstantInt *CurrentConst = nullptr; + if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(CurrentConst)))) + return false; + + // Find the most dominating Xor with the same base operand. + BinaryOperator *DominatingXor = findDominatingXor(BaseOperand, XorInst); + if (!DominatingXor) + return false; + + // We expect the dominating instruction to also be a 'xor-const'. + ConstantInt *DominatingConst = nullptr; + if (!match(DominatingXor, + m_Xor(m_Specific(BaseOperand), m_ConstantInt(DominatingConst)))) + return false; + + // Calculate the adjusted offset (difference between constants) + APInt AdjustedOffset = CurrentConst->getValue() - DominatingConst->getValue(); + + // Check disjoint conditions + // 1. AdjustedOffset and DominatingConst should be disjoint + if ((AdjustedOffset & DominatingConst->getValue()) != 0) + return false; + + // 2. DominatingXor and AdjustedOffset should be disjoint + if (!MaskedValueIsZero(DominatingXor, AdjustedOffset, SimplifyQuery(DL), 0)) + return false; + + // Cache the result. + CachedXorInfo = XorRewriteInfo{DominatingXor, AdjustedOffset.getSExtValue()}; + return true; +} + /// A helper function to check if reassociating through an entry in the user /// chain would invalidate the GEP's nuw flag. static bool allowsPreservingNUW(const User *U) { @@ -805,8 +906,8 @@ static bool allowsPreservingNUW(const User *U) { Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, User *&UserChainTail, - bool &PreservesNUW) { - ConstantOffsetExtractor Extractor(GEP->getIterator()); + bool &PreservesNUW, DominatorTree *DT) { + ConstantOffsetExtractor Extractor(GEP->getIterator(), DT); // Find a non-zero constant offset first. APInt ConstantOffset = Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, @@ -825,12 +926,20 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, return IdxWithoutConstOffset; } -int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { +int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP, + DominatorTree *DT) { // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. - return ConstantOffsetExtractor(GEP->getIterator()) - .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()) - .getSExtValue(); + ConstantOffsetExtractor Extractor(GEP->getIterator(), DT); + auto Offset = Extractor + .find(Idx, /* SignExtended */ false, + /* ZeroExtended */ false, GEP->isInBounds()) + .getSExtValue(); + + // Return the disjoint offset for Xor. + if (Extractor.CachedXorInfo) + return Extractor.CachedXorInfo->AdjustedOffset; + + return Offset; } bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize( @@ -866,7 +975,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); + ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT); if (ConstantOffset != 0) { NeedsExtraction = true; // A GEP may have multiple indices. 
We accumulate the extracted @@ -1106,7 +1215,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { User *UserChainTail; bool PreservesNUW; Value *NewIdx = ConstantOffsetExtractor::Extract( - OldIdx, GEP, UserChainTail, PreservesNUW); + OldIdx, GEP, UserChainTail, PreservesNUW, DT); if (NewIdx != nullptr) { // Switches to the index with the constant offset removed. GEP->setOperand(I, NewIdx); diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll new file mode 100644 index 0000000000000..a0d0de070e735 --- /dev/null +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \ +; RUN: -S < %s | FileCheck %s + +; Test that xor idiom. +; Xors with disjoint constants 4128,8224 and 12320 must be expressed in GEPs. +; Xors with non-disjoint constants 2336 and 8480, should not be optimized. +define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test1( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP2]], 2336 +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 8480 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP20]], i32 8192 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 16384 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP25]], i32 24576 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP16]], align 16 +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP21]], align 16 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP15]], align 16 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP22:%.*]] = fadd <8 x half> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = fadd <8 x half> [[TMP19]], [[TMP11]] +; CHECK-NEXT: [[TMP24:%.*]] = fadd <8 x half> [[TMP12]], [[TMP22]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP23]], [[TMP24]] +; CHECK-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 // Base + %4 = xor i32 %2, 2336 // Not disjoint + %5 = xor i32 %2, 4128 // Disjoint + %6 = xor i32 %2, 8224 // Disjoint + %7 = xor i32 %2, 8480 // Not disjoint + %8 = xor i32 %2, 12320 // 
Disjoint + %9 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %10 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %11 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %12 = getelementptr half, ptr addrspace(3) %1, i32 %6 + %13 = getelementptr half, ptr addrspace(3) %1, i32 %7 + %14 = getelementptr half, ptr addrspace(3) %1, i32 %8 + %15 = load <8 x half>, ptr addrspace(3) %9, align 16 + %16 = load <8 x half>, ptr addrspace(3) %10, align 16 + %17 = load <8 x half>, ptr addrspace(3) %11, align 16 + %18 = load <8 x half>, ptr addrspace(3) %12, align 16 + %19 = load <8 x half>, ptr addrspace(3) %13, align 16 + %20 = load <8 x half>, ptr addrspace(3) %14, align 16 + %21 = fadd <8 x half> %15, %16 + %22 = fadd <8 x half> %17, %18 + %23 = fadd <8 x half> %19, %20 + %24 = fadd <8 x half> %21, %22 + %25 = fadd <8 x half> %23, %24 + store <8 x half> %25, ptr addrspace(3) %1, align 16 + ret void +} From a56ac2f27523f540a5ca286ef7905343450169f7 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 24 Jul 2025 11:08:38 -0500 Subject: [PATCH 2/7] Update lit test with comments --- .../SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll index a0d0de070e735..2cbf2ead2107e 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll @@ -38,12 +38,12 @@ define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) { ; entry: %2 = select i1 %0, i32 0, i32 288 - %3 = xor i32 %2, 32 // Base - %4 = xor i32 %2, 2336 // Not disjoint - %5 = xor i32 %2, 4128 // Disjoint - %6 = xor i32 %2, 8224 // Disjoint - %7 = xor i32 %2, 8480 // Not disjoint - %8 = xor i32 %2, 12320 // Disjoint + %3 = xor i32 %2, 32 ; Base + %4 = xor i32 %2, 2336 ; Not disjoint + %5 = xor i32 %2, 4128 ; Disjoint + %6 = xor i32 %2, 8224 ; Disjoint + %7 = xor i32 %2, 8480 ; Not disjoint + %8 = xor i32 %2, 12320 ; Disjoint %9 = getelementptr half, ptr addrspace(3) %1, i32 %3 %10 = getelementptr half, ptr addrspace(3) %1, i32 %4 %11 = getelementptr half, ptr addrspace(3) %1, i32 %5 From 49bcd01bce48be7fa68cb130606ffa52c0e363c6 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Tue, 29 Jul 2025 12:49:41 -0500 Subject: [PATCH 3/7] Update the patch --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 167 ++++++--------- .../AMDGPU/xor-decompose.ll | 195 ++++++++++++++++++ .../AMDGPU/xor-idiom.ll | 66 ------ 3 files changed, 256 insertions(+), 172 deletions(-) create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll delete mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 203850c28787c..c6ce7859a1f31 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -238,17 +238,16 @@ class ConstantOffsetExtractor { /// \p PreservesNUW Outputs whether the extraction allows preserving the /// GEP's nuw flag, if it has one. 
static Value *Extract(Value *Idx, GetElementPtrInst *GEP, - User *&UserChainTail, bool &PreservesNUW, - DominatorTree *DT); + User *&UserChainTail, bool &PreservesNUW); /// Looks for a constant offset from the given GEP index without extracting /// it. It returns the numeric value of the extracted constant offset (0 if /// failed). The meaning of the arguments are the same as Extract. - static int64_t Find(Value *Idx, GetElementPtrInst *GEP, DominatorTree *DT); + static int64_t Find(Value *Idx, GetElementPtrInst *GEP); private: - ConstantOffsetExtractor(BasicBlock::iterator InsertionPt, DominatorTree *DT) - : IP(InsertionPt), DT(DT), DL(InsertionPt->getDataLayout()) {} + ConstantOffsetExtractor(BasicBlock::iterator InsertionPt) + : IP(InsertionPt), DL(InsertionPt->getDataLayout()) {} /// Searches the expression that computes V for a non-zero constant C s.t. /// V can be reassociated into the form V' + C. If the searching is @@ -322,19 +321,9 @@ class ConstantOffsetExtractor { bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, bool NonNegative); - // Find the most dominating Xor with the same base operand. - BinaryOperator *findDominatingXor(Value *BaseOperand, - BinaryOperator *CurrentXor); - - /// Check if Xor instruction should be considered for optimization. - bool shouldConsiderXor(BinaryOperator *XorInst); - - /// Cache the information about Xor idiom. - struct XorRewriteInfo { - llvm::BinaryOperator *BaseXor = nullptr; - int64_t AdjustedOffset = 0; - }; - std::optional CachedXorInfo; + /// Check if Xor instruction should be considered and updated for + /// optimization. + bool shouldConsiderAndUpdateXor(BinaryOperator *XorInst); /// The path from the constant offset to the old GEP index. e.g., if the GEP /// index is "a * b + (c + 5)". After running function find, UserChain[0] will @@ -351,8 +340,6 @@ class ConstantOffsetExtractor { /// Insertion position of cloned instructions. BasicBlock::iterator IP; - DominatorTree *DT; - const DataLayout &DL; }; @@ -549,9 +536,9 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, !cast(BO)->isDisjoint()) return false; - // Handle Xor idiom. + // Handle Xor decomposition. if (BO->getOpcode() == Instruction::Xor) - return shouldConsiderXor(BO); + return shouldConsiderAndUpdateXor(BO); // FIXME: We don't currently support constants from the RHS of subs, // when we are zero-extended, because we need a way to zero-extended @@ -763,10 +750,6 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { "UserChain, so no one should be used more than " "once"); - // Special case for Xor idiom. - if (BO->getOpcode() == Instruction::Xor) - return CachedXorInfo->BaseXor; - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); Value *NextInChain = removeConstOffset(ChainIndex - 1); @@ -807,77 +790,57 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return NewBO; } -// Find the most dominating Xor with the same base operand. -BinaryOperator * -ConstantOffsetExtractor::findDominatingXor(Value *BaseOperand, - BinaryOperator *CurrentXor) { - BinaryOperator *MostDominatingXor = nullptr; - // Iterate over all instructions that use the BaseOperand. - for (User *U : BaseOperand->users()) { - auto *CandidateXor = dyn_cast(U); - - // Simple checks. - if (!CandidateXor || CandidateXor == CurrentXor) - continue; - - // Check if the binary operator is a Xor with constant. 
- if (!match(CandidateXor, m_Xor(m_Specific(BaseOperand), m_ConstantInt()))) - continue; - - // After confirming the structure, check the dominance relationship. - if (DT->dominates(CandidateXor, CurrentXor)) - // If we find a dominating Xor, keep it if it's the first one, - // or if it dominates the best candidate we've found so far. - if (!MostDominatingXor || DT->dominates(CandidateXor, MostDominatingXor)) - MostDominatingXor = CandidateXor; - } - - return MostDominatingXor; -} - -// Check if Xor should be considered. -// Only the following idiom is considered. -// Example: -// %3 = xor i32 %2, 32 -// %4 = xor i32 %2, 8224 -// %6 = getelementptr half, ptr addrspace(3) %1, i32 %3 -// %7 = getelementptr half, ptr addrspace(3) %1, i32 %4 -// GEP that corresponds to %7, looks at the binary operator %4. -// In order for %4 to be considered, it should have a dominating xor with -// constant offset that is disjoint with an adjusted offset. -// If disjoint, %4 = xor i32 %2, 8224 can be treated as %4 = add i32 %3, 8192 -bool ConstantOffsetExtractor::shouldConsiderXor(BinaryOperator *XorInst) { - - Value *BaseOperand = nullptr; - ConstantInt *CurrentConst = nullptr; - if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(CurrentConst)))) +/// \brief Tries to canonicalize a 'xor' with a constant into a form that is +/// more amenable to address-mode matching. +/// +/// The transformation rewrites `Base ^ Const` into +/// `(Base ^ NonDisjointBits) ^ DisjointBits`. +/// +/// `DisjointBits` are the bits set in `Const` operand that are known to be zero +/// in `Base` operand. For these bits, the `xor` operation is equivalent to +/// `add`, which exposes an offset that can be more easily folded into a memory +/// access. +/// +/// For example, if we know the low bit of `%ptr` is 0: +/// `xor %ptr, 3` ; 3 is `0b11` +/// becomes: +/// `%tmp = xor %ptr, 2` ; NonDisjointBits is `0b10` +/// `xor %tmp, 1` ; DisjointBits is `0b01` +/// +/// The final `xor %tmp, 1` is an addition of 1. +/// +/// \returns `true` if the transformation was applied, `false` otherwise. +bool ConstantOffsetExtractor::shouldConsiderAndUpdateXor( + BinaryOperator *XorInst) { + Value *BaseOperand; + ConstantInt *XorConst; + if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConst)))) return false; - // Find the most dominating Xor with the same base operand. - BinaryOperator *DominatingXor = findDominatingXor(BaseOperand, XorInst); - if (!DominatingXor) - return false; + const SimplifyQuery SQ(DL); + const KnownBits BaseKnown = computeKnownBits(BaseOperand, SQ); + const APInt &ConstValue = XorConst->getValue(); - // We expect the dominating instruction to also be a 'xor-const'. - ConstantInt *DominatingConst = nullptr; - if (!match(DominatingXor, - m_Xor(m_Specific(BaseOperand), m_ConstantInt(DominatingConst)))) + // Check if any bits of the constant can be treated as disjoint + // (addition-like). + const APInt DisjointBits = ConstValue & BaseKnown.Zero; + if (DisjointBits.isZero()) return false; - // Calculate the adjusted offset (difference between constants) - APInt AdjustedOffset = CurrentConst->getValue() - DominatingConst->getValue(); + // Split the xor into disjoint and non-disjoint parts. + const APInt NonDisjointBits = ConstValue & ~DisjointBits; - // Check disjoint conditions - // 1. AdjustedOffset and DominatingConst should be disjoint - if ((AdjustedOffset & DominatingConst->getValue()) != 0) - return false; + IRBuilder<> Builder(XorInst); + Type *Ty = XorConst->getType(); - // 2. 
DominatingXor and AdjustedOffset should be disjoint
-  if (!MaskedValueIsZero(DominatingXor, AdjustedOffset, SimplifyQuery(DL), 0))
-    return false;
+  XorInst->setOperand(1, ConstantInt::get(Ty, DisjointBits));

-  // Cache the result.
-  CachedXorInfo = XorRewriteInfo{DominatingXor, AdjustedOffset.getSExtValue()};
+  return true;
 }

 /// A helper function to check if reassociating through an entry in the user
 /// chain would invalidate the GEP's nuw flag.
 static bool allowsPreservingNUW(const User *U) {
@@ -906,8 +869,8 @@
 Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
                                         User *&UserChainTail,
-                                        bool &PreservesNUW, DominatorTree *DT) {
-  ConstantOffsetExtractor Extractor(GEP->getIterator(), DT);
+                                        bool &PreservesNUW) {
+  ConstantOffsetExtractor Extractor(GEP->getIterator());
   // Find a non-zero constant offset first.
   APInt ConstantOffset = Extractor.find(Idx, /* SignExtended */ false,
                                         /* ZeroExtended */ false,
@@ -926,20 +889,12 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
   return IdxWithoutConstOffset;
 }

-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
-                                      DominatorTree *DT) {
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
   // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
-  ConstantOffsetExtractor Extractor(GEP->getIterator(), DT);
-  auto Offset = Extractor
-                    .find(Idx, /* SignExtended */ false,
-                          /* ZeroExtended */ false, GEP->isInBounds())
-                    .getSExtValue();
-
-  // Return the disjoint offset for Xor.
-  if (Extractor.CachedXorInfo)
-    return Extractor.CachedXorInfo->AdjustedOffset;
-
-  return Offset;
+  return ConstantOffsetExtractor(GEP->getIterator())
+      .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+            GEP->isInBounds())
+      .getSExtValue();
 }

 bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
@@ -975,7 +930,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,

   // Tries to extract a constant offset from this GEP index.
   int64_t ConstantOffset =
-      ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+      ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
   if (ConstantOffset != 0) {
     NeedsExtraction = true;
     // A GEP may have multiple indices. We accumulate the extracted
@@ -1215,7 +1170,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
     User *UserChainTail;
     bool PreservesNUW;
     Value *NewIdx = ConstantOffsetExtractor::Extract(
-        OldIdx, GEP, UserChainTail, PreservesNUW, DT);
+        OldIdx, GEP, UserChainTail, PreservesNUW);
     if (NewIdx != nullptr) {
       // Switches to the index with the constant offset removed.
       GEP->setOperand(I, NewIdx);
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
new file mode 100644
index 0000000000000..f7cd8f3139ae4
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; Test that a xor with a constant operand is decomposed into a GEP.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN:     -S < %s | FileCheck %s
+; Test that the gvn pass eliminates the redundant xor instructions left by the
+; decomposition.
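+; After decomposition each GEP keeps an identical base (see the repeated
+; `xor i32 [[TMP2]], 32` lines in the CHECK output below); GVN is then
+; expected to collapse those duplicates into a single xor.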
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \ +; RUN: -S < %s | FileCheck --check-prefix=GVN %s + +define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test1( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8192 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP10]], i32 16384 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576 +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP11]], align 16 +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16 +; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]] +; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test1( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384 +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]] +; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]] +; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 4128 + %5 = xor i32 %2, 8224 + %6 = xor i32 %2, 12320 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %8 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %9 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %10 = getelementptr half, ptr addrspace(3) %1, i32 %6 + %11 = load <8 x half>, ptr addrspace(3) %7, align 16 + %12 = load <8 x half>, ptr addrspace(3) %8, 
align 16 + %13 = load <8 x half>, ptr addrspace(3) %9, align 16 + %14 = load <8 x half>, ptr addrspace(3) %10, align 16 + %15 = fadd <8 x half> %11, %12 + %16 = fadd <8 x half> %13, %14 + %17 = fadd <8 x half> %15, %16 + store <8 x half> %17, ptr addrspace(3) %1, align 16 + ret void +} + +define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test2( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 24576 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16 +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16 +; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <8 x half> [[TMP18]], [[TMP19]] +; CHECK-NEXT: store <8 x half> [[TMP20]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test2( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 24576 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 16384 +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192 +; GVN-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]] +; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP10]], [[TMP11]] +; GVN-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP12]], [[TMP13]] +; GVN-NEXT: store <8 x half> [[TMP14]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 12320 + %4 = xor i32 %2, 8224 + %5 = xor i32 %2, 4128 + %6 = xor i32 %2, 32 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %8 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %9 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %10 
= getelementptr half, ptr addrspace(3) %1, i32 %6 + %11 = load <8 x half>, ptr addrspace(3) %7, align 16 + %12 = load <8 x half>, ptr addrspace(3) %8, align 16 + %13 = load <8 x half>, ptr addrspace(3) %9, align 16 + %14 = load <8 x half>, ptr addrspace(3) %10, align 16 + %15 = fadd <8 x half> %11, %12 + %16 = fadd <8 x half> %13, %14 + %17 = fadd <8 x half> %15, %16 + store <8 x half> %17, ptr addrspace(3) %1, align 16 + ret void +} + +define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) { +; CHECK-LABEL: define amdgpu_kernel void @test3( +; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288 +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 4096 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]] +; CHECK-NEXT: store <8 x half> [[TMP15]], ptr addrspace(3) [[TMP1]], align 16 +; CHECK-NEXT: ret void +; +; GVN-LABEL: define amdgpu_kernel void @test3( +; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { +; GVN-NEXT: [[ENTRY:.*:]] +; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 +; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 +; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288 +; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096 +; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 8192 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; GVN-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; GVN-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]] +; GVN-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]] +; GVN-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16 +; GVN-NEXT: ret void +; +entry: + %2 = select i1 %0, i32 0, i32 288 + %3 = xor i32 %2, 32 + %4 = xor i32 %2, 2336 + %5 = xor i32 %2, 4128 + %6 = getelementptr half, ptr addrspace(3) %1, i32 %3 + %7 = getelementptr half, ptr addrspace(3) %1, i32 %4 + %8 = getelementptr half, ptr addrspace(3) %1, i32 %5 + %9 = load <8 x half>, ptr addrspace(3) %6, align 16 + %10 = load <8 x half>, ptr addrspace(3) %7, align 16 + %11 = load <8 x half>, ptr addrspace(3) %8, align 16 + %12 = fadd <8 x half> %9, %10 + %13 = fadd <8 x half> %11, %12 + store <8 x half> %13, ptr addrspace(3) %1, align 16 + ret void +} diff --git 
a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll deleted file mode 100644 index 2cbf2ead2107e..0000000000000 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-idiom.ll +++ /dev/null @@ -1,66 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \ -; RUN: -S < %s | FileCheck %s - -; Test that xor idiom. -; Xors with disjoint constants 4128,8224 and 12320 must be expressed in GEPs. -; Xors with non-disjoint constants 2336 and 8480, should not be optimized. -define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) { -; CHECK-LABEL: define amdgpu_kernel void @test1( -; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 -; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP2]], 2336 -; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 8480 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP20]], i32 8192 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 16384 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP25]], i32 24576 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP16]], align 16 -; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP21]], align 16 -; CHECK-NEXT: [[TMP18:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 -; CHECK-NEXT: [[TMP19:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP15]], align 16 -; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd <8 x half> [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = fadd <8 x half> [[TMP19]], [[TMP11]] -; CHECK-NEXT: [[TMP24:%.*]] = fadd <8 x half> [[TMP12]], [[TMP22]] -; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x half> [[TMP23]], [[TMP24]] -; CHECK-NEXT: store <8 x half> [[TMP13]], ptr addrspace(3) [[TMP1]], align 16 -; CHECK-NEXT: ret void -; -entry: - %2 = select i1 %0, i32 0, i32 288 - %3 = xor i32 %2, 32 ; Base - %4 = xor i32 %2, 2336 ; Not disjoint - %5 = xor i32 %2, 4128 ; Disjoint - %6 = xor i32 %2, 8224 ; Disjoint - %7 = xor i32 %2, 8480 ; Not disjoint - %8 = xor i32 %2, 12320 ; Disjoint - %9 = getelementptr half, ptr addrspace(3) %1, i32 %3 - %10 = getelementptr half, ptr addrspace(3) %1, i32 %4 - %11 = getelementptr half, ptr addrspace(3) %1, i32 %5 - %12 = getelementptr half, ptr addrspace(3) %1, i32 %6 - %13 = getelementptr half, ptr addrspace(3) %1, i32 %7 - %14 = getelementptr half, ptr addrspace(3) %1, i32 %8 - %15 = load <8 x half>, ptr addrspace(3) %9, align 16 - %16 = load <8 x half>, ptr 
addrspace(3) %10, align 16 - %17 = load <8 x half>, ptr addrspace(3) %11, align 16 - %18 = load <8 x half>, ptr addrspace(3) %12, align 16 - %19 = load <8 x half>, ptr addrspace(3) %13, align 16 - %20 = load <8 x half>, ptr addrspace(3) %14, align 16 - %21 = fadd <8 x half> %15, %16 - %22 = fadd <8 x half> %17, %18 - %23 = fadd <8 x half> %19, %20 - %24 = fadd <8 x half> %21, %22 - %25 = fadd <8 x half> %23, %24 - store <8 x half> %25, ptr addrspace(3) %1, align 16 - ret void -} From 77869e80c91447aed23fceb85001e5b866a64ad8 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Wed, 30 Jul 2025 17:20:29 -0500 Subject: [PATCH 4/7] Update logic and add more tests --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 116 ++++---- .../AMDGPU/xor-decompose.ll | 260 +++++++++++++++--- 2 files changed, 287 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index c6ce7859a1f31..1605aaa3cd1f6 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -321,9 +321,9 @@ class ConstantOffsetExtractor { bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, bool NonNegative); - /// Check if Xor instruction should be considered and updated for - /// optimization. - bool shouldConsiderAndUpdateXor(BinaryOperator *XorInst); + /// Analyze XOR instruction to extract disjoint constant bits that behave + /// like addition operations for improved address mode folding. + APInt extractDisjointBitsFromXor(BinaryOperator *XorInst); /// The path from the constant offset to the old GEP index. e.g., if the GEP /// index is "a * b + (c + 5)". After running function find, UserChain[0] will @@ -518,14 +518,12 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, bool NonNegative) { - // We only consider ADD, SUB, OR and XOR, because a non-zero constant found in + // We only consider ADD, SUB and OR, because a non-zero constant found in // expressions composed of these operations can be easily hoisted as a - // constant offset by reassociation. XOR is a special case and can be folded - // in to gep if the constant is proven to be disjoint. + // constant offset by reassociation. if (BO->getOpcode() != Instruction::Add && BO->getOpcode() != Instruction::Sub && - BO->getOpcode() != Instruction::Or && - BO->getOpcode() != Instruction::Xor) { + BO->getOpcode() != Instruction::Or) { return false; } @@ -536,10 +534,6 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, !cast(BO)->isDisjoint()) return false; - // Handle Xor decomposition. - if (BO->getOpcode() == Instruction::Xor) - return shouldConsiderAndUpdateXor(BO); - // FIXME: We don't currently support constants from the RHS of subs, // when we are zero-extended, because we need a way to zero-extended // them before they are negated. @@ -643,6 +637,9 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, // Trace into subexpressions for more hoisting opportunities. if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); + // Handle XOR with disjoint bits that can be treated as addition. 
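+    // For such bits, (xor V, C) equals (add V, C): C only flips bits that
+    // are known to be zero in V, so the xor cannot produce a carry.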
+    else if (BO->getOpcode() == Instruction::Xor)
+      ConstantOffset = extractDisjointBitsFromXor(BO);
   } else if (isa<TruncInst>(V)) {
     ConstantOffset =
         find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
@@ -755,11 +752,19 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   Value *NextInChain = removeConstOffset(ChainIndex - 1);
   Value *TheOther = BO->getOperand(1 - OpNo);

-  // If NextInChain is 0 and not the LHS of a sub, we can simplify the
-  // sub-expression to be just TheOther.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
-    if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
-      return TheOther;
+    if (CI->isZero()) {
+      // Special handling for XOR with disjoint bits:
+      // keep the original XOR instruction with the non-disjoint part of the
+      // constant; the remaining operation is still meaningful.
+      if (BO->getOpcode() == Instruction::Xor)
+        return BO;
+
+      // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+      // sub-expression to be just TheOther.
+      if (!(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+        return TheOther;
+    }
   }

   BinaryOperator::BinaryOps NewOp = BO->getOpcode();
@@ -790,58 +795,59 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
   return NewBO;
 }

-/// \brief Tries to canonicalize a 'xor' with a constant into a form that is
-/// more amenable to address-mode matching.
-///
-/// The transformation rewrites `Base ^ Const` into
-/// `(Base ^ NonDisjointBits) ^ DisjointBits`.
+/// Analyze XOR instruction to extract disjoint constant bits for address
+/// folding.
+///
-/// `DisjointBits` are the bits set in the `Const` operand that are known to be
-/// zero in the `Base` operand. For these bits, the `xor` operation is
-/// equivalent to `add`, which exposes an offset that can be more easily folded
-/// into a memory access.
+/// This function identifies bits in an XOR constant operand that are known to
+/// be zero in the base operand. For these disjoint bits, XOR behaves
+/// identically to addition, allowing us to extract them as constant offsets
+/// that can be folded into addressing modes.
 ///
-/// For example, if we know the low bit of `%ptr` is 0:
-///   `xor %ptr, 3`        ; 3 is `0b11`
-/// becomes:
-///   `%tmp = xor %ptr, 2` ; NonDisjointBits is `0b10`
-///   `xor %tmp, 1`        ; DisjointBits is `0b01`
+/// Transformation: `Base ^ Const` becomes `(Base ^ NonDisjointBits) +
+/// DisjointBits` where DisjointBits = Const & KnownZeros(Base).
 ///
-/// The final `xor %tmp, 1` is an addition of 1.
+/// Example with ptr having a known-zero low bit:
+///   Original: `xor %ptr, 3`  ; 3 = 0b11
+///   Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
+///   Result:   `(xor %ptr, 2) + 1` where 1 can be folded into the address mode
 ///
-/// \returns `true` if the transformation was applied, `false` otherwise.
-bool ConstantOffsetExtractor::shouldConsiderAndUpdateXor(
+/// \param XorInst The XOR binary operator to analyze
+/// \return APInt containing the disjoint bits that can be extracted as offset,
+///         or zero if no disjoint bits exist
+APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
     BinaryOperator *XorInst) {
+  assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
+         "Expected XOR instruction");
+
+  const unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
   Value *BaseOperand;
-  ConstantInt *XorConst;
-  if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConst))))
+  ConstantInt *XorConstant;
+
+  // Match pattern: xor BaseOperand, Constant.
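+  // Note: m_ConstantInt binds only scalar ConstantInt values, so vector xor
+  // operands are not decomposed here.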
+ if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant)))) + return APInt::getZero(BitWidth); + // Compute known bits for the base operand. const SimplifyQuery SQ(DL); - const KnownBits BaseKnown = computeKnownBits(BaseOperand, SQ); - const APInt &ConstValue = XorConst->getValue(); + const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ); + const APInt &ConstantValue = XorConstant->getValue(); - // Check if any bits of the constant can be treated as disjoint - // (addition-like). - const APInt DisjointBits = ConstValue & BaseKnown.Zero; - if (DisjointBits.isZero()) - return false; + // Identify disjoint bits: constant bits that are known zero in base. + const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero; - // Split the xor into disjoint and non-disjoint parts. - const APInt NonDisjointBits = ConstValue & ~DisjointBits; + // Early exit if no disjoint bits found. + if (DisjointBits.isZero()) + return APInt::getZero(BitWidth); - IRBuilder<> Builder(XorInst); - Type *Ty = XorConst->getType(); + // Compute the remaining non-disjoint bits that stay in the XOR. + const APInt NonDisjointBits = ConstantValue & ~DisjointBits; - // Transform: (base ^ constant) -> ((base ^ non_disjoint) ^ disjoint). - if (!NonDisjointBits.isZero()) { - Value *NewBase = - Builder.CreateXor(BaseOperand, ConstantInt::get(Ty, NonDisjointBits)); - XorInst->setOperand(0, NewBase); - } + // Add the non-disjoint constant to the user chain for later transformation + // This will replace the original constant in the XOR with the reduced + // constant. + UserChain.push_back(ConstantInt::get(XorInst->getType(), NonDisjointBits)); - XorInst->setOperand(1, ConstantInt::get(Ty, DisjointBits)); - return true; + return DisjointBits; } /// A helper function to check if reassociating through an entry in the user diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll index f7cd8f3139ae4..df3a9180b1617 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll @@ -6,25 +6,27 @@ ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep,gvn \ ; RUN: -S < %s | FileCheck --check-prefix=GVN %s +; Check that disjoint constants are properly extracted and folded into GEP +; addressing modes and GVN to eliminate redundant computations define amdgpu_kernel void @test1(i1 %0, ptr addrspace(3) %1) { ; CHECK-LABEL: define amdgpu_kernel void @test1( ; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 8192 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP10]], i32 16384 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr 
addrspace(3) [[TMP1]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192 +; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384 +; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP12]], i32 24576 -; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16 -; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP11]], align 16 +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 ; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16 ; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x half> [[TMP16]], [[TMP17]] @@ -72,6 +74,8 @@ entry: ret void } +; Check that disjoint constants are properly extracted and folded into GEP +; addressing modes and GVN to eliminate redundant computations define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) { ; CHECK-LABEL: define amdgpu_kernel void @test2( ; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { @@ -79,17 +83,17 @@ define amdgpu_kernel void @test2(i1 %0, ptr addrspace(3) %1) { ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 ; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 24576 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 16384 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 24576 +; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP8]], i32 16384 +; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP11]], i32 8192 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 
x half>, ptr addrspace(3) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP9]], align 16 ; CHECK-NEXT: [[TMP16:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP12]], align 16 ; CHECK-NEXT: [[TMP17:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP13]], align 16 ; CHECK-NEXT: [[TMP18:%.*]] = fadd <8 x half> [[TMP14]], [[TMP15]] @@ -138,21 +142,22 @@ entry: ret void } +; Verify that xor instructions with different non-disjoint constants are optimized define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) { ; CHECK-LABEL: define amdgpu_kernel void @test3( ; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288 -; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 4096 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096 +; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP2]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP9]], i32 8192 -; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 ; CHECK-NEXT: [[TMP13:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP10]], align 16 ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x half> [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = fadd <8 x half> [[TMP13]], [[TMP14]] @@ -164,12 +169,12 @@ define amdgpu_kernel void @test3(i1 %0, ptr addrspace(3) %1) { ; GVN-NEXT: [[ENTRY:.*:]] ; GVN-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288 ; GVN-NEXT: [[TMP3:%.*]] = xor i32 [[TMP2]], 32 -; GVN-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], 288 -; GVN-NEXT: [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] -; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]] +; GVN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]] +; GVN-NEXT: [[TMP5:%.*]] = xor i32 [[TMP2]], 288 +; GVN-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]] ; GVN-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 4096 -; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP5]], i32 8192 -; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16 +; GVN-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP4]], i32 8192 +; GVN-NEXT: [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16 ; GVN-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16 ; GVN-NEXT: [[TMP11:%.*]] = load <8 
 ; GVN-NEXT:    [[TMP12:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
@@ -193,3 +198,190 @@ entry:
   store <8 x half> %13, ptr addrspace(3) %1, align 16
   ret void
 }
+
+; Verify that no optimization occurs when disjoint constants are absent
+define amdgpu_kernel void @test4(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test4(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test4(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; GVN-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 288
+  %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %7 = load <8 x half>, ptr addrspace(3) %5, align 16
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = fadd <8 x half> %7, %8
+  store <8 x half> %9, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+
+; Verify that XOR-BinOp-GEP usage chains are properly optimized
+define amdgpu_kernel void @test5(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test5(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 256
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test5(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = add i32 [[TMP3]], 256
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 8192
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 4128
+  %5 = add i32 %4, 256
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %10 = fadd <8 x half> %8, %9
+  store <8 x half> %10, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+
+; Verify that BinOp-XOR-GEP usage chains are properly optimized
+; This represents the common pattern found in real target workloads
+define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test6(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], 32
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test6(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], 32
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
+; GVN-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
+; GVN-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
+; GVN-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = add i32 %2, 256
+  %5 = xor i32 %4, 4128
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %10 = fadd <8 x half> %8, %9
+  store <8 x half> %10, ptr addrspace(3) %1, align 16
+  ret void
+}
+
+
+; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
+; not extracted
+define amdgpu_kernel void @test7(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test7(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test7(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], 32800
+; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP4]]
+; GVN-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP6]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = fadd <8 x half> [[TMP7]], [[TMP8]]
+; GVN-NEXT:    store <8 x half> [[TMP9]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = xor i32 %2, 32800
+  %5 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %4
+  %7 = load <8 x half>, ptr addrspace(3) %5, align 16
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = fadd <8 x half> %7, %8
+  store <8 x half> %9, ptr addrspace(3) %1, align 16
+  ret void
+}
+
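For reference, the rewrite these tests exercise splits an xor constant into a part that is provably disjoint from the known bits of the base value, which is hoisted into a constant byte-offset gep, and a residual part that stays in the index. A minimal sketch with test5's constants (4128 = 4096 | 32, where the pass must prove the 4096 bit is known zero in the base; 4096 half elements scale to 8192 bytes; %x and %lds are placeholder names, not values from the tests):

  ; before
  %i = xor i32 %x, 4128
  %p = getelementptr half, ptr addrspace(3) %lds, i32 %i
  ; after: the disjoint 4096 elements become a byte-offset gep
  %i.new = xor i32 %x, 32
  %p.base = getelementptr half, ptr addrspace(3) %lds, i32 %i.new
  %p.new = getelementptr i8, ptr addrspace(3) %p.base, i32 8192

test7 above checks the companion limit: the same split is rejected when the resulting byte offset would exceed what the addressing mode can fold.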
From fa0af2612cba7aae1835f80f888e540b8d6f4f47 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni
Date: Thu, 31 Jul 2025 13:10:12 -0500
Subject: [PATCH 5/7] Fix recursion logic

---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     |  8 +++++
 .../AMDGPU/xor-decompose.ll                   | 36 +++++++++----------
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 1605aaa3cd1f6..8533b645ba774 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -827,6 +827,14 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
   if (!match(XorInst, m_Xor(m_Value(BaseOperand), m_ConstantInt(XorConstant))))
     return APInt::getZero(BitWidth);
 
+  // Try to extract constant offset from the base operand recursively.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
+    APInt ConstantOffset = find(BO, /*SignExtended=*/false,
+                                /*ZeroExtended=*/false, /*NonNegative=*/false);
+    if (ConstantOffset.isZero())
+      return ConstantOffset;
+  }
+
   // Compute known bits for the base operand.
   const SimplifyQuery SQ(DL);
   const KnownBits BaseKnownBits = computeKnownBits(BaseOperand, SQ);
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
index df3a9180b1617..d9f73c1d30cce 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -290,24 +290,21 @@ entry:
   ret void
 }
 
-
 ; Verify that BinOp-XOR-GEP usage chains are properly optimized
-; This represents the common pattern found in real target workloads
 define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
 ; CHECK-LABEL: define amdgpu_kernel void @test6(
 ; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
 ; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 4128
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ; GVN-LABEL: define amdgpu_kernel void @test6(
@@ -315,15 +312,14 @@ define amdgpu_kernel void @test6(i1 %0, ptr addrspace(3) %1) {
 ; GVN-NEXT:  [[ENTRY:.*:]]
 ; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
 ; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
-; GVN-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], 256
-; GVN-NEXT:    [[TMP5:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
-; GVN-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], 32
-; GVN-NEXT:    [[TMP7:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP6]]
-; GVN-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP7]], i32 8192
-; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP5]], align 16
-; GVN-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP8]], align 16
-; GVN-NEXT:    [[TMP11:%.*]] = fadd <8 x half> [[TMP9]], [[TMP10]]
-; GVN-NEXT:    store <8 x half> [[TMP11]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 4128
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
 ; GVN-NEXT:    ret void
 ;
 entry:
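The recursion added in this patch lets the extractor look through a binary operator feeding the xor and peel that operator's constant instead. A sketch of the rewrite the updated test6 expects (the add's 256 half elements become a 512-byte gep while the xor constant stays in the index; %x and %lds are placeholders, and the reassociation holds here because the add's carries cannot reach the xor'd bits for the values the select can produce):

  ; before
  %a = add i32 %x, 256
  %i = xor i32 %a, 4128
  %p = getelementptr half, ptr addrspace(3) %lds, i32 %i
  ; after: the add's constant is peeled out through the xor
  %i.new = xor i32 %x, 4128
  %p.base = getelementptr half, ptr addrspace(3) %lds, i32 %i.new
  %p.new = getelementptr i8, ptr addrspace(3) %p.base, i32 512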
From 0f88c77e18531cf7860c2313e1b5f1df83be6bff Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni
Date: Thu, 31 Jul 2025 13:22:17 -0500
Subject: [PATCH 6/7] Fix typo

---
 llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 8533b645ba774..6e715a0160419 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -831,7 +831,7 @@ APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BaseOperand)) {
     APInt ConstantOffset = find(BO, /*SignExtended=*/false,
                                 /*ZeroExtended=*/false, /*NonNegative=*/false);
-    if (ConstantOffset.isZero())
+    if (!ConstantOffset.isZero())
       return ConstantOffset;
   }
 
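With the corrected check, the recursively found constant is returned only when one actually exists; otherwise the code falls through to the known-bits analysis of the base operand. One case where that fall-through matters, sketched under the assumption that the pass's other gating checks are satisfied (%x and %lds are placeholders): a base whose defining binary operator carries no additive constant that find() can peel, but whose known-zero bits still make the xor constant disjoint.

  %a = shl i32 %x, 4                 ; low four bits of %a are known zero
  %i = xor i32 %a, 8                 ; 8 lies entirely in those known-zero bits
  %p = getelementptr half, ptr addrspace(3) %lds, i32 %i
  ; the recursion finds no constant under %a, so the known-bits path can
  ; still split off the disjoint 8 elements as a 16-byte offset
  %p.base = getelementptr half, ptr addrspace(3) %lds, i32 %a
  %p.new = getelementptr i8, ptr addrspace(3) %p.base, i32 16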
From 378e5ffa943b90384014f1e25740676a9cdd5a57 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni
Date: Fri, 1 Aug 2025 09:15:55 -0500
Subject: [PATCH 7/7] Update lit test

---
 .../AMDGPU/xor-decompose.ll                   | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
index d9f73c1d30cce..36900ba7e16bd 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-decompose.ll
@@ -336,6 +336,52 @@ entry:
   ret void
 }
 
+; Verify that BinOp-XOR-GEP usage chains with a non-disjoint xor work as
+; intended.
+define amdgpu_kernel void @test6a(i1 %0, ptr addrspace(3) %1) {
+; CHECK-LABEL: define amdgpu_kernel void @test6a(
+; CHECK-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    ret void
+;
+; GVN-LABEL: define amdgpu_kernel void @test6a(
+; GVN-SAME: i1 [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) {
+; GVN-NEXT:  [[ENTRY:.*:]]
+; GVN-NEXT:    [[TMP2:%.*]] = select i1 [[TMP0]], i32 0, i32 288
+; GVN-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], 32
+; GVN-NEXT:    [[TMP4:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP3]]
+; GVN-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP2]], 288
+; GVN-NEXT:    [[TMP6:%.*]] = getelementptr half, ptr addrspace(3) [[TMP1]], i32 [[TMP5]]
+; GVN-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP6]], i32 512
+; GVN-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP4]], align 16
+; GVN-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr addrspace(3) [[TMP7]], align 16
+; GVN-NEXT:    [[TMP10:%.*]] = fadd <8 x half> [[TMP8]], [[TMP9]]
+; GVN-NEXT:    store <8 x half> [[TMP10]], ptr addrspace(3) [[TMP1]], align 16
+; GVN-NEXT:    ret void
+;
+entry:
+  %2 = select i1 %0, i32 0, i32 288
+  %3 = xor i32 %2, 32
+  %4 = add i32 %2, 256
+  %5 = xor i32 %4, 288
+  %6 = getelementptr half, ptr addrspace(3) %1, i32 %3
+  %7 = getelementptr half, ptr addrspace(3) %1, i32 %5
+  %8 = load <8 x half>, ptr addrspace(3) %6, align 16
+  %9 = load <8 x half>, ptr addrspace(3) %7, align 16
+  %10 = fadd <8 x half> %8, %9
+  store <8 x half> %10, ptr addrspace(3) %1, align 16
+  ret void
+}
 
 ; Ensure disjoint constants exceeding addressing mode limits (e.g., 32768) are
 ; not extracted