@@ -17,6 +17,9 @@ SPDX-License-Identifier: MIT
 #include "Compiler/IGCPassSupport.h"
 #include "SynchronizationObjectCoalescing.hpp"
 #include "visa_igc_common_header.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include <utility>
 #include <map>
@@ -285,6 +288,9 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
       static_cast<SynchronizationCaseMask>(WriteSyncRead | WriteSyncWrite | AtomicSyncRead | AtomicSyncWrite |
                                            WriteSyncAtomic | ReadSyncAtomic | ReadSyncWrite | AtomicSyncAtomic);

+  ////////////////////////////////////////////////////////////////////////
+  void CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced, llvm::Instruction *pFenceInst);
+
   ////////////////////////////////////////////////////////////////////////
   void EraseRedundantInst(llvm::Instruction *pInst);
@@ -327,6 +333,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {

   ////////////////////////////////////////////////////////////////////////
   bool IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
+                                             std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
                                              bool onlyGlobalAtomics = false) const;

   ////////////////////////////////////////////////////////////////////////
@@ -440,6 +447,7 @@ class SynchronizationObjectCoalescing : public llvm::FunctionPass {
   std::vector<llvm::Instruction *> m_LscMemoryFences;
   std::vector<llvm::Instruction *> m_UntypedMemoryFences;
   std::vector<llvm::Instruction *> m_ThreadGroupBarriers;
+  std::unordered_set<llvm::Instruction *> m_SourcedAtomicInstructions;

   // this variable holds a mapping from a basic block to its memory instructions ordered by their occurrences in it
   // (the initial index of line of this basic block - the number of instructions preceding an instruction in its basic
@@ -538,6 +546,125 @@ bool SynchronizationObjectCoalescing::ProcessFunction() {
   return FindRedundancies();
 }

+// Referenced from MemoryModelPass
+static inline PHINode *FindDominatingPhi(DominatorTree &DT, Instruction *def, BasicBlock *postDominator) {
+  IGC_ASSERT(def->getParent() != postDominator);
+  IGC_ASSERT(!DT.dominates(def, postDominator));
+  SmallPtrSet<PHINode *, 8> seen;
+  SmallVector<User *, 8> worklist(def->users());
+  while (!worklist.empty()) {
+    PHINode *phi = dyn_cast<PHINode>(worklist.pop_back_val());
+    if (phi == nullptr || seen.count(phi) > 0) {
+      continue;
+    }
+    if (phi->getParent() == postDominator || DT.dominates(phi, postDominator)) {
+      return phi;
+    }
+    seen.insert(phi);
+  }
+  return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////
+/// @brief A fence instruction responsible only for ordering atomic instructions
+///        can be replaced with a source value intrinsic, which still maintains
+///        the order of those instructions.
+void SynchronizationObjectCoalescing::CreateSourceValueInst(std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
+                                                            llvm::Instruction *pFenceInst) {
+  IGC_ASSERT(pAtomicInstToBeSourced.size() > 0);
+  // Reverse the list to source the atomic instructions in program order.
+  std::reverse(pAtomicInstToBeSourced.begin(), pAtomicInstToBeSourced.end());
+  Function *funcPtr = GenISAIntrinsic::getDeclaration(pFenceInst->getModule(), GenISAIntrinsic::GenISA_source_value);
+  BasicBlock *fenceBB = pFenceInst->getParent();
+
+  Function *F = pAtomicInstToBeSourced[0]->getFunction();
+  DominatorTree DT(*F);
+  PostDominatorTree PDT(*F);
+  LoopInfo LI(DT);
+
+  for (llvm::Instruction *atomicInst : pAtomicInstToBeSourced) {
+    // Make sure that the fence instruction is potentially reachable from the atomic instruction.
+    if (!isPotentiallyReachable(atomicInst, pFenceInst, nullptr, &DT, &LI)) {
+      continue;
+    }
+
+    // In a few shaders, the atomic instructions were already sourced before unification; skip creating one in that
+    // case.
+    bool atomicSourced = false;
+    for (User *U : atomicInst->users()) {
+      if (GenIntrinsicInst *Inst = dyn_cast<GenIntrinsicInst>(U)) {
+        // TODO: If the dominance check fails, either move the source_value to a BB that dominates the fence and
+        // post-dominates the atomic, or delete it and let the code below create a new one. Requires further testing.
+        if (Inst->getIntrinsicID() == GenISAIntrinsic::GenISA_source_value &&
+            DT.dominates(Inst->getParent(), fenceBB)) {
+          atomicSourced = true;
+          break;
+        }
+      }
+    }
+    if (atomicSourced) {
+      m_SourcedAtomicInstructions.insert(atomicInst);
+      continue;
+    }
+
+    BasicBlock *atomicBB = atomicInst->getParent();
+    BasicBlock *fenceDominator = fenceBB;
+    Instruction *insertPoint = atomicBB->getTerminator();
+    Value *sourceVal = cast<GenIntrinsicInst>(atomicInst);
+
+    // TODO: Determining the insert point can be improved to postpone the source value intrinsic as long as possible.
+    // A similar analysis is done in FindOptimalInsertPoints() in ApplyCacheControls.cpp.
+
+    // Check whether the fence instruction's BB post-dominates the atomic instruction's BB;
+    // otherwise, find the BB that is a predecessor of the fence BB and post-dominates the atomic BB.
+    // If no such BB exists, the insert point is the terminator of the atomic BB.
+    while (fenceDominator && fenceDominator != atomicBB) {
+      if (PDT.dominates(fenceDominator, atomicBB)) {
+        // If the fence instruction is in the same BB, use the fence as the insert point;
+        // otherwise use the terminator of fenceDominator as the insert point.
+        insertPoint = fenceBB == fenceDominator ? pFenceInst : fenceDominator->getTerminator();
+        // It's possible that the atomic instruction does not dominate
+        // the post-dominator; find a PHI user of the atomic instruction
+        // that dominates the post-dominator.
+        if (!DT.dominates(atomicBB, fenceDominator)) {
+          PHINode *phi = FindDominatingPhi(DT, atomicInst, fenceDominator);
+          if (phi) {
+            sourceVal = phi;
+          } else {
+            // Fall back to inserting the source value in the basic
+            // block containing the atomic instruction.
+            insertPoint = atomicBB->getTerminator();
+          }
+        }
+        break;
+      }
+      fenceDominator = fenceDominator->getSinglePredecessor();
+    }
+    // If the fence is in the same BB as the atomic, insert at the fence instruction.
+    if (fenceBB == atomicBB) {
+      insertPoint = pFenceInst;
+    }
+
+    IRBuilder<> builder(insertPoint);
+    Type *sourceValType = sourceVal->getType();
+
+    // The source value intrinsic accepts only i32.
+    if (sourceValType->isIntegerTy()) {
+      sourceVal = builder.CreateZExtOrTrunc(sourceVal, builder.getInt32Ty());
+    } else if (sourceValType->isFloatingPointTy()) {
+      if (sourceValType->isFloatTy()) {
+        sourceVal = builder.CreateBitCast(sourceVal, builder.getInt32Ty());
+      } else {
+        sourceVal = builder.CreateFPToUI(sourceVal, builder.getInt32Ty());
+      }
+    } else {
+      IGC_ASSERT_MESSAGE(0, "Unexpected type");
+    }
+
+    builder.CreateCall(funcPtr, {sourceVal});
+    m_SourcedAtomicInstructions.insert(atomicInst);
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////
 void SynchronizationObjectCoalescing::EraseRedundantInst(llvm::Instruction *pInst) {
   bool isFence = IsFenceOperation(pInst);
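A note on the i32 conversion at the end of `CreateSourceValueInst`: `GenISA_source_value` consumes only i32, so the builder normalizes whatever scalar the atomic produced. Below is a minimal standalone sketch of that conversion, factored into a helper; the helper name is illustrative and not part of this patch, and it presumes only the liveness of the value matters (not its exact bits), which is why a lossy `fptoui` is acceptable for the wider FP types.

```cpp
#include "llvm/IR/IRBuilder.h"

// Sketch: normalize an arbitrary scalar to i32 for GenISA_source_value.
static llvm::Value *normalizeToI32(llvm::IRBuilder<> &builder, llvm::Value *v) {
  llvm::Type *ty = v->getType();
  if (ty->isIntegerTy())
    return builder.CreateZExtOrTrunc(v, builder.getInt32Ty()); // iN -> i32
  if (ty->isFloatTy())
    return builder.CreateBitCast(v, builder.getInt32Ty()); // f32 -> i32, bit-preserving
  if (ty->isFloatingPointTy())
    return builder.CreateFPToUI(v, builder.getInt32Ty()); // f16/f64 -> i32, lossy
  return nullptr; // unexpected type; the pass asserts in this case
}
```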
@@ -740,7 +867,13 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
       }
       SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
       bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
-      isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst, true /* onlyGlobalAtomics*/);
+
+      std::vector<llvm::Instruction *> atomicInstToBeSourced;
+      if (!isObligatory) {
+        isObligatory =
+            IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced, true /* onlyGlobalAtomics*/);
+      }
+
       bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
       verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
@@ -767,6 +900,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
 #if _DEBUG
         RegisterRedundancyExplanation(pInst, ExplanationEntry::GlobalMemoryRedundancy);
 #endif // _DEBUG
+        if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) && atomicInstToBeSourced.size() > 0) {
+          CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
+        }
         EraseRedundantGlobalScope(pInst);
         isModified = true;
         SetLocalMemoryInstructionMask();
@@ -831,7 +967,12 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
           GetSynchronizationMaskForAllResources(localForwardMemoryInstructionMask, localBackwardMemoryInstructionMask);
       SynchronizationCaseMask referenceSyncCaseMask = GetStrictSynchronizationMask(pInst);
       bool isObligatory = (syncCaseMask & referenceSyncCaseMask) != 0;
-      isObligatory |= IsRequiredForAtomicOperationsOrdering(pInst);
+
+      std::vector<llvm::Instruction *> atomicInstToBeSourced;
+      if (!isObligatory) {
+        isObligatory = IsRequiredForAtomicOperationsOrdering(pInst, atomicInstToBeSourced);
+      }
+
       bool verifyUnsynchronizedInstructions = IsFenceOperation(pInst);
       verifyUnsynchronizedInstructions &= (!isObligatory || syncCaseMask == ReadSyncWrite);
@@ -847,6 +988,9 @@ bool SynchronizationObjectCoalescing::FindRedundancies() {
 #if _DEBUG
         RegisterRedundancyExplanation(pInst, ExplanationEntry::StrictRedundancy);
 #endif // _DEBUG
+        if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) && atomicInstToBeSourced.size() > 0) {
+          CreateSourceValueInst(atomicInstToBeSourced, const_cast<Instruction *>(pInst));
+        }
         EraseRedundantInst(pInst);
         isModified = true;
       }
@@ -1731,8 +1875,9 @@ SynchronizationObjectCoalescing::GetUnsynchronizedForwardInstructionMask(const l
 /// operations present before the fence (in program order)
 /// @param pSourceInst the source synchronization instruction
 /// @param onlyGlobalAtomics check only TGM and UGM atomic operations
-bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(const llvm::Instruction *pSourceInst,
-                                                                            bool onlyGlobalAtomics /* = false*/) const {
+bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(
+    const llvm::Instruction *pSourceInst, std::vector<llvm::Instruction *> &pAtomicInstToBeSourced,
+    bool onlyGlobalAtomics /* = false*/) const {
   if (!IsFenceOperation(pSourceInst)) {
     // Not a fence, nothing to check
     return false;
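For reference, this is the call-site shape the new out-parameter enables in `FindRedundancies()`, shown here in isolation (a condensed sketch; `pFence` is an illustrative local, and the real code `const_cast`s `pInst`): the vector collects the atomics whose ordering only this fence guarantees, so they can be sourced before the fence is erased.

```cpp
std::vector<llvm::Instruction *> atomicInstToBeSourced;
bool isObligatory = IsRequiredForAtomicOperationsOrdering(pFence, atomicInstToBeSourced,
                                                          /*onlyGlobalAtomics=*/true);
if (!isObligatory && IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue) &&
    !atomicInstToBeSourced.empty()) {
  // The fence is otherwise redundant: keep the atomics ordered with
  // source_value intrinsics instead, then erase the fence.
  CreateSourceValueInst(atomicInstToBeSourced, pFence);
}
```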
@@ -1782,6 +1927,10 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
     {
       isPotentiallyUnsynchronizedAtomic = false;
       // Lambda that checks if a fence operation synchronizes the atomic operation.
+      // This can be improved by detecting the users of the atomic instruction and ending the search for fences once
+      // such a user is found. This user is essentially the same as the source value intrinsic; however, it can be
+      // reordered in vISA, affecting the execution order of atomic instructions. If we can find a way to treat this
+      // user as a special instruction and avoid reordering, we can skip creating a new source value instruction.
       std::function<bool(const llvm::Instruction *)> IsBoundaryInst = [this, &atomicPointerMemoryInstructionMask,
                                                                        &isPotentiallyUnsynchronizedAtomic,
                                                                        pSourceInst](const llvm::Instruction *pInst) {
@@ -1832,6 +1981,11 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
         for (llvm::BasicBlock::const_iterator it = ++pSourceInst->getIterator(); it != pSourceInst->getParent()->end();
              ++it) {
           const llvm::Instruction *pCurrInst = &(*it);
+          // If we encounter an atomic instruction after pSourceInst (the source fence), then the fence is required to
+          // execute pInst (the initial atomic) before pCurrInst (the current atomic).
+          if (IsAtomicOperation(pCurrInst)) {
+            break;
+          }
           if (IsFenceOperation(pCurrInst) && IsSubstituteInstruction(pCurrInst, pSourceInst)) {
             substituteFenceFound = true;
             break;
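The early break above encodes an invariant: a substitute fence only makes the original redundant if it appears before the next atomic in the block; otherwise that atomic still depends on the original fence for ordering. A condensed sketch of the scan as a standalone predicate (function and parameter names are illustrative, not from the patch):

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include <functional>
#include <iterator>

// Returns true when a substitute fence follows pFence before any further
// atomic operation in the same basic block.
static bool SubstituteFoundBeforeNextAtomic(
    const llvm::Instruction *pFence,
    const std::function<bool(const llvm::Instruction *)> &isAtomic,
    const std::function<bool(const llvm::Instruction *)> &isSubstituteFence) {
  for (auto it = std::next(pFence->getIterator()); it != pFence->getParent()->end(); ++it) {
    const llvm::Instruction *pCurr = &*it;
    if (isAtomic(pCurr))
      return false; // a later atomic still needs the original fence
    if (isSubstituteFence(pCurr))
      return true;
  }
  return false;
}
```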
@@ -1840,7 +1994,22 @@ bool SynchronizationObjectCoalescing::IsRequiredForAtomicOperationsOrdering(cons
           if (!substituteFenceFound) {
             // Found an atomic operation that requires the source fence
             // instruction for correct memory ordering.
-            return true;
+
+            // If ReplaceAtomicFenceWithSourceValue is enabled, we can replace this fence with GenISA_source_value,
+            // which sources the atomic instruction and still maintains the order of atomic instructions.
+            // Otherwise, return true, marking the fence instruction as obligatory.
+
+            if (IGC_IS_FLAG_ENABLED(ReplaceAtomicFenceWithSourceValue)) {
+              // If a previous fence was replaced with a source value intrinsic, GetVisibleMemoryInstructions will add
+              // the same atomic instruction again for the next fence, resulting in multiple source value intrinsics,
+              // but we need it to be sourced only once. Hence we check whether it was already sourced previously, and
+              // continue checking all remaining atomic instructions that may need to be sourced.
+              if (m_SourcedAtomicInstructions.find(const_cast<Instruction *>(pInst)) == m_SourcedAtomicInstructions.end()) {
+                pAtomicInstToBeSourced.push_back(const_cast<Instruction *>(pInst));
+              }
+            } else {
+              return true;
+            }
           }
         }
       }
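On the `m_SourcedAtomicInstructions` check: successive fences can report the same atomic via `GetVisibleMemoryInstructions`, but one source_value per atomic suffices. A minimal standalone model of the once-only bookkeeping (types simplified; this is not the pass's actual code):

```cpp
#include <unordered_set>

// Each erased fence offers its ordered atomics; an atomic is sourced at most
// once no matter how many fences reported it.
struct SourcingTracker {
  std::unordered_set<const void *> sourced; // stands in for m_SourcedAtomicInstructions

  // Returns true exactly once per atomic: insert().second is false on repeats.
  bool shouldSource(const void *atomicInst) { return sourced.insert(atomicInst).second; }
};
```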
@@ -2002,6 +2171,7 @@ void SynchronizationObjectCoalescing::InvalidateMembers() {
   m_OrderedFenceInstructionsInBasicBlockCache.clear();
   m_OrderedBarrierInstructionsInBasicBlockCache.clear();
   m_BasicBlockMemoryInstructionMaskCache.clear();
+  m_SourcedAtomicInstructions.clear();
 #if _DEBUG
   m_ExplanationEntries.clear();
 #endif // _DEBUG