Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 114 additions & 19 deletions llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ static cl::opt<bool> EnableChainsAcrossMultiBlocks(
"aie-chain-addr-multi-block", cl::Hidden, cl::init(true),
cl::desc("Enable ptradd chaining when Ptr is used across multiple MBBs."));

static cl::opt<bool>
DetachCondLoadChain("aie-chain-addr-detach-cond-load-jump", cl::Hidden,
cl::init(true),
cl::desc("Disable ptradd chaining that feed "
"loads that are used in conditional jumps."));

namespace {

LLT getLoadStoreType(const MachineInstr &MI) {
Expand All @@ -84,10 +90,10 @@ LLT getLoadStoreType(const MachineInstr &MI) {

/// Try and re-order PTR_ADD instructions to maximise the size of constant
/// PTR_ADD chains.
bool optimisePostIncrements(ArrayRef<MachineInstr *> PtrAdds,
const MachineRegisterInfo &MRI,
MachineIRBuilder &MIB,
GISelObserverWrapper &Observer) {
bool bundleConstIncrements(ArrayRef<MachineInstr *> PtrAdds,
const MachineRegisterInfo &MRI,
MachineIRBuilder &MIB,
GISelObserverWrapper &Observer) {
bool Changed = false;

// Look for the following sequence:
Expand Down Expand Up @@ -194,6 +200,15 @@ class AIEClusterBaseAddress : public MachineFunctionPass {

// Find if a register is used in reachable MBBs.
bool isRegUsedInSuccessiveMBBs(MachineBasicBlock *MBB, Register Reg);

/// Return a set of Load Instrs whose results are used in the path of
/// the conditional branch of \p MBB .
std::set<MachineInstr *>
getLoadsFeedingCondBranch(MachineBasicBlock &MBB) const;

/// \return whether PtrAdd uses a Load Instr in \p LoadsToAvoid .
bool avoidPtrAdd(MachineInstr *PtrAdd,
const std::set<MachineInstr *> &LoadsToAvoid) const;
};

void AIEClusterBaseAddress::getAnalysisUsage(AnalysisUsage &AU) const {
Expand Down Expand Up @@ -224,8 +239,7 @@ bool AIEClusterBaseAddress::runOnMachineFunction(MachineFunction &MF) {

bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
while (processBasicBlock(MBB, MIB, Observer))
Changed = true;
Changed |= processBasicBlock(MBB, MIB, Observer);
}
return Changed;
}
Expand All @@ -241,8 +255,13 @@ bool AIEClusterBaseAddress::processBasicBlock(MachineBasicBlock &MBB,

// Optimise instruction order
for (auto &RegAndUse : reverse(RegAndUses)) {
// Chaining acceptance criteria.
SmallVector<MachineInstr *, 8> &Instrs = RegAndUse.second;
if (shouldSkipChaining(RegAndUse.first, Instrs, MBB))
continue;

ArrayRef<MachineInstr *> PtrAdds = RegAndUse.second;
Changed |= optimisePostIncrements(PtrAdds, *MRI, MIB, Observer);
Changed |= bundleConstIncrements(PtrAdds, *MRI, MIB, Observer);
}

// Create chains, when profitable.
Expand All @@ -259,12 +278,94 @@ bool AIEClusterBaseAddress::processBasicBlock(MachineBasicBlock &MBB,
return Changed;
}

/// Recursively search bottom up for Load instrs in the use chain of \p MI .
/// Stop the search when Exiting \p MBB . Return all found Load MachineInstr in
/// \p LoadsFeedingInstrs .
void findLoadsFeedingInstr(MachineInstr &MI, MachineBasicBlock *MBB,
std::set<MachineInstr *> &LoadsFeedingInstrs,
const MachineRegisterInfo &MRI) {
for (MachineOperand &MO : MI.uses()) {
if (!MO.isReg())
continue;

Register UseReg = MO.getReg();
if (!UseReg.isVirtual())
continue;

auto *UseMI = MRI.getUniqueVRegDef(UseReg);
if (!UseMI)
continue;

if (UseMI->getParent() != MBB || UseMI->isPHI())
continue;

if (UseMI->mayLoad()) {
LoadsFeedingInstrs.emplace(UseMI);
LLVM_DEBUG(dbgs() << "Found Feeding Load " << *UseMI);
}

findLoadsFeedingInstr(*UseMI, MBB, LoadsFeedingInstrs, MRI);
}
}

std::set<MachineInstr *>
AIEClusterBaseAddress::getLoadsFeedingCondBranch(MachineBasicBlock &MBB) const {
  assert(MRI);

  // With the option disabled, never detach anything: report no loads.
  std::set<MachineInstr *> CondBranchLoads;
  if (!DetachCondLoadChain)
    return CondBranchLoads;

  // Walk the terminators of MBB; only the first conditional branch is
  // relevant, so stop as soon as one has been analyzed.
  for (auto It = MBB.getFirstTerminator(), End = MBB.end(); It != End; ++It) {
    if (!It->isConditionalBranch())
      continue;
    findLoadsFeedingInstr(*It, &MBB, CondBranchLoads, *MRI);
    break;
  }

  return CondBranchLoads;
}

/// \return whether \p PtrAdd feeds a Load Instr in \p LoadsToAvoid .
/// Note: every non-debug user of the G_PTR_ADD result is inspected, not just
/// the first one, so the verdict does not depend on use-list order when the
/// pointer has multiple users.
bool AIEClusterBaseAddress::avoidPtrAdd(
    MachineInstr *PtrAdd, const std::set<MachineInstr *> &LoadsToAvoid) const {
  assert(PtrAdd->getOpcode() == TargetOpcode::G_PTR_ADD);

  // Is G_PTR_ADD feeding a Load instruction that we want to keep detached?
  const Register DefReg = PtrAdd->getOperand(0).getReg();
  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
    if (!UseMI.mayLoad())
      continue;
    if (LoadsToAvoid.count(&UseMI)) {
      LLVM_DEBUG(dbgs() << "Found Load feeding Cond Branch attached to "
                        << *PtrAdd;);
      return true;
    }
  }

  return false;
}

/// Collect, per base pointer register, the G_PTR_ADDs of \p MBB that are
/// candidates for chaining. G_PTR_ADDs feeding a Load that is used by the
/// block's conditional branch are excluded.
AIEClusterBaseAddress::RegUseMap
AIEClusterBaseAddress::collectPtrUses(MachineBasicBlock &MBB) {
  // Initialize Load Instrs to avoid.
  const std::set<MachineInstr *> LoadsToAvoid = getLoadsFeedingCondBranch(MBB);

  RegUseMap RegAndUses;
  for (MachineInstr &PtrAdd : MBB) {
    // Only consider G_PTR_ADDs.
    if (PtrAdd.getOpcode() != TargetOpcode::G_PTR_ADD)
      continue;

    // If G_PTR_ADD is used in a Load in LoadsToAvoid, ignore PtrAdd in
    // chain collection. An example could be a Load Instr that feeds a
    // conditional jump: chaining would delay that Load and hence the branch
    // decision, increasing the critical path.
    if (!LoadsToAvoid.empty() && avoidPtrAdd(&PtrAdd, LoadsToAvoid))
      continue;

    // Group the candidate under its base pointer register (operand 1).
    RegAndUses[PtrAdd.getOperand(1).getReg()].push_back(&PtrAdd);
  }
  return RegAndUses;
}
Expand All @@ -274,17 +375,11 @@ bool AIEClusterBaseAddress::shouldSkipChaining(
MachineBasicBlock &MBB) {

// No chain possibility at all.
if (Instrs.size() <= 1)
if (Instrs.size() <= 1 || (!EnableChainsAcrossMultiBlocks &&
isRegUsedInSuccessiveMBBs(&MBB, PtrReg)))
return true;

// Chain MBB regardless.
if (EnableChainsAcrossMultiBlocks)
return false;

// If the base reg is used in any of the successive MBBs, then we don't
// want to chain the corresponding ptr adds, since this would introduce a
// COPY and increase reg pressure.
return isRegUsedInSuccessiveMBBs(&MBB, PtrReg);
return false;
}

bool AIEClusterBaseAddress::buildChain(SmallVector<MachineInstr *, 8> &Instrs,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
# RUN: llc -mtriple aie2 -run-pass=aie-cluster-base-address %s -verify-machineinstrs -o - | FileCheck %s

# Address Chaining test across multiple MBBs.

# Decouple a Load that feeds into a conditional jump, so that it can be scheduled
# as early as possible and not delay the branch decision.
---
name: DecoupleCondJumpLoad
legalized: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: DecoupleCondJumpLoad
; CHECK: bb.0.entry:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: liveins: $p0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 4
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 8
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 12
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[COPY]], [[C]](s20)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (dereferenceable load (s32))
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 4
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[PTR_ADD]], [[C3]](s20)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (dereferenceable load (s32))
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = nuw G_PTR_ADD [[COPY]], [[C2]](s20)
; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (dereferenceable load (s32))
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: %BR_SEL:_(s1) = G_ICMP intpred(sle), [[LOAD2]](s32), [[C4]]
; CHECK-NEXT: G_BRCOND %BR_SEL(s1), %bb.1
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (dereferenceable load (s32))
; CHECK-NEXT: PseudoRET implicit $lr, implicit [[LOAD]](s32), implicit [[LOAD1]](s32), implicit [[LOAD3]](s32)
bb.0.entry:
liveins: $p0
%0:_(p0) = COPY $p0
%1:_(s20) = G_CONSTANT i20 4
%2:_(s20) = G_CONSTANT i20 8
%3:_(s20) = G_CONSTANT i20 12
%4:_(p0) = nuw G_PTR_ADD %0, %1(s20)
%5:_(s32) = G_LOAD %4(p0) :: (dereferenceable load (s32))
%6:_(p0) = nuw G_PTR_ADD %0, %2(s20)
%7:_(s32) = G_LOAD %6(p0) :: (dereferenceable load (s32))
%8:_(p0) = nuw G_PTR_ADD %0, %3(s20)
%9:_(s32) = G_LOAD %8(p0) :: (dereferenceable load (s32))
%10:_(s32) = G_CONSTANT i32 4
%BR_SEL:_(s1) = G_ICMP intpred(sle), %9:_(s32), %10:_(s32)
G_BRCOND %BR_SEL, %bb.1
G_BR %bb.2

bb.1:

G_BR %bb.2

bb.2:
%11:_(s32) = G_LOAD %0(p0) :: (dereferenceable load (s32))
PseudoRET implicit $lr, implicit %5, implicit %7, implicit %11
...
76 changes: 35 additions & 41 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,55 +34,49 @@ declare { ptr, i20, i20 } @llvm.aie2.add.3d(ptr, i20, i20, i20, i20, i20, i20, i
define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm2_data, ptr noalias %ofm_data, ptr %.out, ptr %conv.i.i.i.out, ptr %idx.ext9.out, ptr %.out1, ptr %.out2, ptr %.out3, ptr %.out4, ptr %.out5, ptr %conv.i.i.i.i.i.out, ptr %.out6, ptr %conv.i.i.i46.out, ptr %xtraiter.out, ptr %in_ptr1.051.unr.ce.out, ptr %in_ptr2.0.in50.unr.ce.out, ptr %out_ptr.049.unr.ce.out, ptr %itr_left_cnt0.048.unr.ce.out, ptr %itr_left_cnt1.047.unr.ce.out) #3 {
; ASM-LABEL: add2d:
; ASM: // %bb.0: // %newFuncRoot
; ASM-NEXT: nopa ; paddb [sp], #32; nopxm ; nops
; ASM-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill
; ASM-NEXT: paddb [p0], #40; st p6, [sp, #-28] // 4-byte Folded Spill
; ASM-NEXT: lda m2, [p0], #-4; mov p6, sp
; ASM-NEXT: lda m3, [p0], #8; paddb [p6], #-36; mov p7, sp
; ASM-NEXT: lda r5, [p6, #0]; paddb [p7], #-40
; ASM-NEXT: lda p7, [p7, #0]
; ASM-NEXT: lda r2, [p0, #64]; paddb [p0], #40; nopm
; ASM-NEXT: lda m2, [p0], #-4
; ASM-NEXT: lda m5, [p0], #8
; ASM-NEXT: lda m4, [p0], #-24
; ASM-NEXT: lda r0, [p0], #36
; ASM-NEXT: lda r3, [p0], #-32
; ASM-NEXT: lda m4, [p0], #8
; ASM-NEXT: lda m3, [p0], #-24; paddb [sp], #32
; ASM-NEXT: lda r0, [p0], #4; st p6, [sp, #-28] // 4-byte Folded Spill
; ASM-NEXT: lda r1, [p0], #-12; mov p6, sp
; ASM-NEXT: lda r2, [p0], #40; paddb [p6], #-44
; ASM-NEXT: lda r3, [p0], #40; paddb [p6], #-36
; ASM-NEXT: lda p7, [p6, #0]; mov p6, sp
; ASM-NEXT: paddb [p6], #-40
; ASM-NEXT: lda r5, [p6, #0]; mov p6, sp
; ASM-NEXT: lda m1, [p0], #36; paddb [p6], #-44
; ASM-NEXT: lda p6, [p6, #0]
; ASM-NEXT: nop
; ASM-NEXT: lda m1, [p0], #36
; ASM-NEXT: lda m0, [p0], #-8
; ASM-NEXT: lda dn0, [p0], #-8
; ASM-NEXT: lda dj0, [p0], #12
; ASM-NEXT: lda dn4, [p0], #-8; st r2, [p4, #0]
; ASM-NEXT: lda dj4, [p0], #-36; nez r4, r1; mov p4, sp
; ASM-NEXT: lda r1, [p0, #0]; paddb [p4], #-48; st r4, [p5, #0]
; ASM-NEXT: lda p4, [p4, #0]; mov p5, r5
; ASM-NEXT: lda r5, [p0, #-36]; mov p0, sp
; ASM-NEXT: paddb [p0], #-72; st m1, [p5, #0]
; ASM-NEXT: lda p0, [p0, #0]; mov p5, sp
; ASM-NEXT: paddb [p5], #-52; st m0, [p7, #0]
; ASM-NEXT: lda p5, [p5, #0]; mov p7, sp
; ASM-NEXT: st dj0, [p6, #0]
; ASM-NEXT: paddb [p7], #-56; mov p6, sp
; ASM-NEXT: lda r6, [p7, #0]; mov p7, sp
; ASM-NEXT: paddb [p6], #-60; st dj4, [p4, #0]
; ASM-NEXT: lda r7, [p6, #0]; paddb [p7], #-64; mov p4, sp
; ASM-NEXT: lda p7, [p7, #0]; paddb [p4], #-76
; ASM-NEXT: lda r11, [p4, #0]; mov p6, sp
; ASM-NEXT: lda m0, [p0], #-8; st p7, [sp, #-32] // 4-byte Folded Spill
; ASM-NEXT: lda dn0, [p0], #-8; st r3, [p4, #0]
; ASM-NEXT: lda dj0, [p0], #12; nez r4, r1; mov p4, sp
; ASM-NEXT: lda dn4, [p0], #-8; paddb [p4], #-48; st r4, [p5, #0]
; ASM-NEXT: lda p4, [p4, #0]; mov p5, sp
; ASM-NEXT: lda dj4, [p0], #-36; st m1, [p7, #0]
; ASM-NEXT: lda r1, [p0, #0]; mov p7, r5
; ASM-NEXT: lda r5, [p0, #-36]; paddb [p5], #-52; mov p0, sp
; ASM-NEXT: lda p5, [p5, #0]; st m0, [p7, #0]
; ASM-NEXT: paddb [p0], #-72; mov p7, sp
; ASM-NEXT: lda p0, [p0, #0]; paddb [p7], #-56; st dj0, [p6, #0]
; ASM-NEXT: lda r6, [p7, #0]; mov p6, sp
; ASM-NEXT: paddb [p6], #-60; mov p7, sp
; ASM-NEXT: lda r7, [p6, #0]; paddb [p7], #-64; mov p6, sp
; ASM-NEXT: lda p7, [p7, #0]; st dj4, [p4, #0]
; ASM-NEXT: mov p4, sp
; ASM-NEXT: paddb [p6], #-68; st dn0, [p5, #0]
; ASM-NEXT: paddb [p4], #-76; st dn0, [p5, #0]
; ASM-NEXT: lda r11, [p4, #0]; paddb [p6], #-68; mov p4, sp
; ASM-NEXT: lda r8, [p6, #0]; paddb [p4], #-80; mov p5, r6
; ASM-NEXT: lda p6, [p4, #0]; mov p4, sp
; ASM-NEXT: mova r6, #1; paddb [p4], #-84; nez r1, r1; st dn4, [p5, #0]
; ASM-NEXT: lda r9, [p4, #0]; ne r6, r0, r6; mov p4, sp
; ASM-NEXT: mova r0, #3; paddb [p4], #-88; add r7, r3, #-1; mov p5, r7
; ASM-NEXT: mova r0, #3; paddb [p4], #-88; add r7, r2, #-1; mov p5, r7
; ASM-NEXT: lda r10, [p4, #0]; ltu r7, r7, r0; mov p4, sp
; ASM-NEXT: jz r7, #.LBB0_2
; ASM-NEXT: paddb [p4], #-92; st r1, [p5, #0] // Delay Slot 5
; ASM-NEXT: lda p4, [p4, #0]; st r5, [p7, #0] // Delay Slot 4
; ASM-NEXT: paddb [p2], m3; mov p7, r8 // Delay Slot 3
; ASM-NEXT: st r6, [p7, #0]; paddb [p2], m5; and r8, r3, r0 // Delay Slot 2
; ASM-NEXT: padda [p1], m2; paddb [p2], m4; movx r0, #0; st r8, [p0, #0] // Delay Slot 1
; ASM-NEXT: paddb [p2], m5; mov p7, r8 // Delay Slot 3
; ASM-NEXT: st r6, [p7, #0]; paddb [p2], m4; and r8, r2, r0 // Delay Slot 2
; ASM-NEXT: padda [p1], m2; paddb [p2], m3; movx r0, #0; st r8, [p0, #0] // Delay Slot 1
; ASM-NEXT: // %bb.1:
; ASM-NEXT: j #.LBB0_5
; ASM-NEXT: nop // Delay Slot 5
Expand All @@ -96,11 +90,11 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: vlda.3d.ups.s32.d8 cm0, s1, [p2], d0; nopx
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; movxm ls, #.LBB0_3
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; movx r0, #-4; mov s1, r2
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; movx r0, #-4; mov s1, r3
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; movxm le, #.L_LEnd0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; and r0, r3, r0
; ASM-NEXT: mova r3, #-2; add r0, r0, #-4
; ASM-NEXT: lshl r0, r0, r3; mov crSRSSign, r6
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; and r0, r2, r0
; ASM-NEXT: mova r2, #-2; add r0, r0, #-4
; ASM-NEXT: lshl r0, r0, r2; mov crSRSSign, r6
; ASM-NEXT: add r0, r0, #1; mov s0, r5
; ASM-NEXT: add.nc lc, r0, #-1
; ASM-NEXT: .LBB0_3: // %for.body
Expand Down