Skip to content

Commit 393114b

Browse files
pthamminigcbot
authored and committed
Replace Atomic Fence with GenISA_source_value try 2
Replace Atomic Fence with GenISA_source_value try 2
1 parent 9a52805 commit 393114b

File tree

3 files changed

+179
-5
lines changed

3 files changed

+179
-5
lines changed

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8574,6 +8574,7 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst *inst) {
85748574
}
85758575
case GenISAIntrinsic::GenISA_source_value: {
85768576
m_encoder->Copy(m_currShader->GetNULL(), GetSymbol(inst->getOperand(0)));
8577+
m_encoder->Fence(false, false, false, false, false, false, false, true);
85778578
m_encoder->Push();
85788579
break;
85798580
}

IGC/Compiler/Optimizer/SynchronizationObjectCoalescing.cpp

Lines changed: 175 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ SPDX-License-Identifier: MIT
1717
#include "Compiler/IGCPassSupport.h"
1818
#include "SynchronizationObjectCoalescing.hpp"
1919
#include "visa_igc_common_header.h"
20+
#include "llvm/IR/IRBuilder.h"
21+
#include "llvm/Analysis/CFG.h"
22+
#include "llvm/Analysis/LoopInfo.h"
2023
#include <utility>
2124
#include <map>
2225

@@ -285,6 +288,9 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
285288
static_cast<SynchronizationCaseMask>(WriteSyncRead | WriteSyncWrite | AtomicSyncRead | AtomicSyncWrite |
286289
WriteSyncAtomic | ReadSyncAtomic | ReadSyncWrite | AtomicSyncAtomic);
287290

291+
////////////////////////////////////////////////////////////////////////
292+
void CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced, llvm::Instruction *pFenceInst);
293+
288294
////////////////////////////////////////////////////////////////////////
289295
void EraseRedundantInst(llvm::Instruction *pInst);
290296

@@ -327,6 +333,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
327333

328334
////////////////////////////////////////////////////////////////////////
329335
bool IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
336+
std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
330337
bool onlyGlobalAtomics = false) const;
331338

332339
////////////////////////////////////////////////////////////////////////
@@ -440,6 +447,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
440447
std::vector<llvm::Instruction *> m_LscMemoryFences;
441448
std::vector<llvm::Instruction *> m_UntypedMemoryFences;
442449
std::vector<llvm::Instruction *> m_ThreadGroupBarriers;
450+
std::unordered_set<llvm::Instruction *> m_SourcedAtomicInstructions;
443451

444452
// this variable holds a mapping from a basic block to its memory instructions ordered by their occurrences in it
445453
// (the initial index of line of this basic block - the number of instructions preceding an instruction in its basic
@@ -538,6 +546,125 @@ bool SynchronizationObjectCoalescing::ProcessFunction() {
538546
return FindRedundancies();
539547
}
540548

549+
// Referenced from MemoryModelPass
550+
static inline PHINode *FindDominatingPhi(DominatorTree &DT, Instruction *def, BasicBlock *postDominator) {
551+
IGC_ASSERT(def->getParent() != postDominator);
552+
IGC_ASSERT(!DT.dominates(def, postDominator));
553+
SmallPtrSet<PHINode *, 8> seen;
554+
SmallVector<User *, 8> worklist(def->users());
555+
while (!worklist.empty()) {
556+
PHINode *phi = dyn_cast<PHINode>(worklist.pop_back_val());
557+
if (phi == nullptr || seen.count(phi) > 0) {
558+
continue;
559+
}
560+
if (phi->getParent() == postDominator || DT.dominates(phi, postDominator)) {
561+
return phi;
562+
}
563+
seen.insert(phi);
564+
}
565+
return nullptr;
566+
}
567+
568+
////////////////////////////////////////////////////////////////////////
569+
/// @brief Fence Instruction responsible for only ordering of atomic Instructions
570+
/// can be replaced with Source Value Intrinsic which will still maintain
571+
/// the order of Instructions
572+
void SynchronizationObjectCoalescing::CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
573+
llvm::Instruction *pFenceInst) {
574+
IGC_ASSERT(pAtomicInstToBeSourced.size() > 0);
575+
// reversing the list to source the atomic instructions in the order
576+
reverse(pAtomicInstToBeSourced.begin(), pAtomicInstToBeSourced.end());
577+
Function *funcPtr = GenISAIntrinsic::getDeclaration(pFenceInst->getModule(), GenISAIntrinsic::GenISA_source_value);
578+
BasicBlock *fenceBB = pFenceInst->getParent();
579+
580+
Function *F = pAtomicInstToBeSourced[0]->getFunction();
581+
DominatorTree DT(*F);
582+
PostDominatorTree PDT(*F);
583+
LoopInfo LI(DT);
584+
585+
for (llvm::Instruction *atomicInst : pAtomicInstToBeSourced) {
586+
// Making sure that the Fence Inst is potentially reachable from the atomic Instruction.
587+
if (!isPotentiallyReachable(atomicInst, pFenceInst, nullptr, &DT, &LI)) {
588+
continue;
589+
}
590+
591+
// In few shaders, the atomic instructions were already sourced before unification. Skip creating one in this case
592+
bool atomicSourced = false;
593+
for (User *U : atomicInst->users()) {
594+
if (GenIntrinsicInst *Inst = dyn_cast<GenIntrinsicInst>(U)) {
595+
// TODO: If dominates fail, then either move the source_value to BB that dominates fence and post dominates
596+
// atomic or delete them and let the below code create new ones. Requires further testing.
597+
if (Inst->getIntrinsicID() == GenISAIntrinsic::GenISA_source_value &&
598+
DT.dominates(Inst->getParent(), fenceBB)) {
599+
atomicSourced = true;
600+
break;
601+
}
602+
}
603+
}
604+
if (atomicSourced) {
605+
m_SourcedAtomicInstructions.insert(atomicInst);
606+
continue;
607+
}
608+
609+
BasicBlock *atomicBB = atomicInst->getParent();
610+
BasicBlock *fenceDominator = fenceBB;
611+
Instruction *insertPoint = atomicBB->getTerminator();
612+
Value *sourceVal = cast<GenIntrinsicInst>(atomicInst);
613+
614+
// TODO: Determining Insert point can be improved which can postpone the source value intrinsic as long as possible.
615+
// Similar analysis is done in FindOptimalInsertPoints() in ApplyCacheControls.cpp
616+
617+
// Check if fence Instruction BB post dominates atomic Instruction BB
618+
// Else find the BB that is a predecessor of fence BB and post dominates atomic BB.
619+
// If we don't find one, then the insert point is near the terminator of atomic BB
620+
while (fenceDominator && fenceDominator != atomicBB) {
621+
if (PDT.dominates(fenceDominator, atomicBB)) {
622+
// If fence instruction is in same BB, then use fence as insert point
623+
// Else use the terminator of fenceDominator as insert point
624+
insertPoint = fenceBB == fenceDominator ? pFenceInst : fenceDominator->getTerminator();
625+
// It's possible that the atomic instruction does not dominate
626+
// the post-dominator, find a PHI user of the atomic instruction
627+
// that dominates the post-dominator.
628+
if (!DT.dominates(atomicBB, fenceDominator)) {
629+
PHINode *phi = FindDominatingPhi(DT, atomicInst, fenceDominator);
630+
if (phi) {
631+
sourceVal = phi;
632+
} else {
633+
// Fallback to inserting the source value in the basic
634+
// block with the atomic instruction.
635+
insertPoint = atomicBB->getTerminator();
636+
}
637+
}
638+
break;
639+
}
640+
fenceDominator = fenceDominator->getSinglePredecessor();
641+
}
642+
// If Fence is present in same BB as atomic, then insert at Fence Instruction
643+
if (fenceBB == atomicBB) {
644+
insertPoint = pFenceInst;
645+
}
646+
647+
IRBuilder<> builder(insertPoint);
648+
Type *sourceValType = sourceVal->getType();
649+
650+
// Source value intrinsic accepts only i32.
651+
if (sourceValType->isIntegerTy()) {
652+
sourceVal = builder.CreateZExtOrTrunc(sourceVal, builder.getInt32Ty());
653+
} else if (sourceValType->isFloatingPointTy()) {
654+
if (sourceValType->isFloatTy()) {
655+
sourceVal = builder.CreateBitCast(sourceVal, builder.getInt32Ty());
656+
} else {
657+
sourceVal = builder.CreateFPToUI(sourceVal, builder.getInt32Ty());
658+
}
659+
} else {
660+
IGC_ASSERT_MESSAGE(0, "Unexpected type");
661+
}
662+
663+
builder.CreateCall(funcPtr, {sourceVal});
664+
m_SourcedAtomicInstructions.insert(atomicInst);
665+
}
666+
}
667+
541668
////////////////////////////////////////////////////////////////////////
542669
void SynchronizationObjectCoalescing::EraseRedundantInst(llvm::Instruction *pInst) {
543670
bool isFence = IsFenceOperation(pInst);
@@ -740,7 +867,13 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
740867
}
741868
SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
742869
bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
743-
isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst, true /*onlyGlobalAtomics*/);
870+
871+
std::vector<llvm::Instruction *> atomicInstToBeSourced;
872+
if (!isObligatory) {
873+
isObligatory =
874+
IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced, true /*onlyGlobalAtomics*/);
875+
}
876+
744877
bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
745878
verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
746879

@@ -767,6 +900,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
767900
#if _DEBUG
768901
RegisterRedundancyExplanation(pInst, ExplanationEntry::GlobalMemoryRedundancy);
769902
#endif // _DEBUG
903+
if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) && atomicInstToBeSourced.size() > 0) {
904+
CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
905+
}
770906
EraseRedundantGlobalScope(pInst);
771907
isModified = true;
772908
SetLocalMemoryInstructionMask();
@@ -831,7 +967,12 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
831967
GetSynchronizationMaskForAllResources(localForwardMemoryInstructionMask, localBackwardMemoryInstructionMask);
832968
SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
833969
bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
834-
isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst);
970+
971+
std::vector<llvm::Instruction *> atomicInstToBeSourced;
972+
if (!isObligatory) {
973+
isObligatory = IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced);
974+
}
975+
835976
bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
836977
verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
837978

@@ -847,6 +988,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
847988
#if _DEBUG
848989
RegisterRedundancyExplanation(pInst, ExplanationEntry::StrictRedundancy);
849990
#endif // _DEBUG
991+
if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) && atomicInstToBeSourced.size() > 0) {
992+
CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
993+
}
850994
EraseRedundantInst(pInst);
851995
isModified = true;
852996
}
@@ -1731,8 +1875,9 @@ SynchronizationObjectCoalescing::GetUnsynchronizedForwardInstructionMask(const l
17311875
/// operations present before the fence (in program order)
17321876
/// @param pSourceInst the source synchronization instruction
17331877
/// @param onlyGlobalAtomics check only TGM and UGM atomic operations
1734-
bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
1735-
bool onlyGlobalAtomics /*= false*/) const {
1878+
bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(
1879+
const llvm::Instruction *pSourceInst, std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
1880+
bool onlyGlobalAtomics /*= false*/) const {
17361881
if (!IsFenceOperation(pSourceInst)) {
17371882
// Not a fence, nothing to check
17381883
return false;
@@ -1782,6 +1927,10 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
17821927
{
17831928
isPotentiallyUnsynchronizedAtomic = false;
17841929
// Lambda that checks if a fence operation synchronizes the atomic operation.
1930+
// This can be improved to detect the users of atomic instruction and end the search for fences once we find the
1931+
// user. This user is essentially same as Source Value Intrinsic, however it can be reordered in visa affecting
1932+
// the execution order of atomic instructions. If we can find a way to treat this user as a special instruction
1933+
// and avoid reordering, we can skip creating new source value instruction.
17851934
std::function<bool(const llvm::Instruction *)> IsBoundaryInst = [this, &atomicPointerMemoryInstructionMask,
17861935
&isPotentiallyUnsynchronizedAtomic,
17871936
pSourceInst](const llvm::Instruction *pInst) {
@@ -1832,6 +1981,11 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18321981
for (llvm::BasicBlock::const_iterator it = ++pSourceInst->getIterator(); it != pSourceInst->getParent()->end();
18331982
++it) {
18341983
const llvm::Instruction *pCurrInst = &(*it);
1984+
// If we encounter an atomic instruction after pSourceInst (Source Fence), then the fence is required to execute
1985+
// pInst (initial atomic) before pCurrInst (current atomic)
1986+
if (IsAtomicOperation(pCurrInst)) {
1987+
break;
1988+
}
18351989
if (IsFenceOperation(pCurrInst) && IsSubstituteInstruction(pCurrInst, pSourceInst)) {
18361990
substituteFenceFound = true;
18371991
break;
@@ -1840,7 +1994,22 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
18401994
if (!substituteFenceFound) {
18411995
// Found an atomic operation that requires the source fence
18421996
// instruction for correct memory ordering.
1843-
return true;
1997+
1998+
// If ReplaceAtomicFenceWithSourceValue is true, we can replace this fence with GenISA_source_value.
1999+
// This will source the atomic instruction and still maintains the order of atomic instructions.
2000+
// Else return true marking the fence instruction as Obligatory.
2001+
2002+
if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue)) {
2003+
// If a previous fence was replaced with source value intrinsic, GetVisibleMemoryInstructions will add the
2004+
// same atomic instruction again for the next fence resulting in multiple source value intrinsics but we need
2005+
// it to be sourced only once. Hence we check if it was already sourced previously. Continues to check all
2006+
// valid atomic Instructions to be sourced.
2007+
if (m_SourcedAtomicInstructions.find(const_cast<Instruction *>(pInst)) == m_SourcedAtomicInstructions.end()) {
2008+
pAtomicInstToBeSourced.push_back(const_cast<Instruction *>(pInst));
2009+
}
2010+
} else {
2011+
return true;
2012+
}
18442013
}
18452014
}
18462015
}
@@ -2002,6 +2171,7 @@ void SynchronizationObjectCoalescing::InvalidateMembers() {
20022171
m_OrderedFenceInstructionsInBasicBlockCache.clear();
20032172
m_OrderedBarrierInstructionsInBasicBlockCache.clear();
20042173
m_BasicBlockMemoryInstructionMaskCache.clear();
2174+
m_SourcedAtomicInstructions.clear();
20052175
#if _DEBUG
20062176
m_ExplanationEntries.clear();
20072177
#endif // _DEBUG

IGC/common/igc_flags.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,9 @@ DECLARE_IGC_REGKEY(
435435
"The mask is casted to IGC::SyncInstMask and informs which synchronization objects should not be coalesced. Note "
436436
"that synchronization objects classified in multiple types are not disabled if any bit describing them is off.",
437437
true)
438+
DECLARE_IGC_REGKEY(bool, ReplaceAtomicFenceWithSourceValue, true,
439+
"Fences are required to maintain the order of atomic memory instructions. This flag will replace the fence with "
440+
"GenISA_source_value intrinsic which sources the result of atomic operation and still maintains the order.", true)
438441
DECLARE_IGC_REGKEY(bool, UnrollLoopForCodeSizeOnly, false,
439442
"Only unroll the loop if it can reduce program size/register pressure. Ignore all other threshold "
440443
"setting but still enable PromoteLoopUnrollwithAlloca due to high likelyhood to reduce size.",

0 commit comments

Comments
 (0)