Skip to content

Commit 6ee4bd0

Browse files
[AIEx] Enhance Super-Reg-Rewrite pass to expandCopyBundle
The new strategy exposes a fundamental problem in how the live-range splitting logic creates bundled instructions when sub-registers are involved (refer to SplitEditor::buildSingleSubRegCopy). From a standard LLVM perspective this is not a problem, but it becomes one for AIE because of what we do in the Super-Reg-Rewrite pass: we turn the sub-registers into complete registers (which we want/need to do), but now there are COPY instructions where one COPY ends a live range in the bundle and a different COPY in the same bundle starts a new live range, both using the same register class for src and dst. The major issue arises when reg-alloc ends up assigning the same register to such COPYs in the same bundle; AFAIK this happens because the whole bundle is assigned a single slot index. By expanding the copy bundle we give each COPY MI a unique slot and its associated operands a proper LiveInterval.
1 parent 6b97af9 commit 6ee4bd0

File tree

3 files changed

+256
-20
lines changed

3 files changed

+256
-20
lines changed

llvm/lib/Target/AIE/AIESuperRegRewriter.cpp

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ class AIESuperRegRewriter : public MachineFunctionPass {
7373
const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM,
7474
LiveRegMatrix &LRM, LiveIntervals &LIS,
7575
SlotIndexes &Indexes, LiveDebugVariables &DebugVars);
76+
77+
void expandCopyBundle(MachineInstr &MI, MachineFunction &MF,
78+
const AIEBaseRegisterInfo &TRI, SlotIndexes &Indexes,
79+
SmallSet<Register, 8> &RecomputeLIandLRM);
7680
};
7781

7882
/// Returns the subreg indices that can be used to rewrite \p Reg into smaller
@@ -208,6 +212,36 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) {
208212
DebugVars);
209213
}
210214

215+
// Expand CopyBundle
216+
SmallSet<Register, 8> RecomputeLIandLRM;
217+
for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
218+
MBBI != MBBE; ++MBBI) {
219+
LLVM_DEBUG(MBBI->print(dbgs(), &Indexes));
220+
for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) {
221+
expandCopyBundle(MI, MF, TRI, Indexes, RecomputeLIandLRM);
222+
}
223+
}
224+
225+
for (Register Reg : RecomputeLIandLRM) {
226+
if (LIS.hasInterval(Reg)) {
227+
LLVM_DEBUG(dbgs() << "Recomputing live range for " << printReg(Reg, &TRI)
228+
<< '\n');
229+
// Recompute the LiveIntervals for the register and update the
230+
// LiveRegMatrix
231+
if (Reg.isPhysical()) {
232+
const MCRegister PhysReg = VRM.getPhys(Reg);
233+
const LiveInterval &OldLI = LIS.getInterval(Reg);
234+
LIS.removeInterval(Reg);
235+
LRM.unassign(OldLI);
236+
const LiveInterval &LI = LIS.getInterval(Reg);
237+
LRM.assign(LI, PhysReg);
238+
} else {
239+
LIS.removeInterval(Reg);
240+
LIS.getInterval(Reg);
241+
}
242+
}
243+
}
244+
211245
LLVM_DEBUG(VRM.dump());
212246
return !AssignedPhysRegs.empty();
213247
}
@@ -265,7 +299,7 @@ void AIESuperRegRewriter::rewriteSuperReg(
265299
MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM,
266300
LiveRegMatrix &LRM, LiveIntervals &LIS, SlotIndexes &Indexes,
267301
LiveDebugVariables &DebugVars) {
268-
bool AssignPhysRegIsValid = AssignedPhysReg.isValid();
302+
bool AssignPhysRegIsValid = AssignedPhysReg.isPhysical();
269303
LLVM_DEBUG(dbgs() << "Rewriting " << printReg(Reg, &TRI, 0, &MRI)
270304
<< " Assigned " << AssignPhysRegIsValid << '\n');
271305
auto *TII = static_cast<const AIEBaseInstrInfo *>(
@@ -365,6 +399,84 @@ void AIESuperRegRewriter::rewriteSuperReg(
365399
DebugVars.splitRegister(Reg, NewVRegs, LIS);
366400
}
367401

402+
// The liverange splitting logic sometimes produces bundles of copies when
403+
// subregisters are involved. Expand these into a sequence of copy instructions
404+
// after processing the last in the bundle. This is needed to ensure that the
405+
// un-assigned virtual reg operands of COPY that were part of these bundles have
406+
// a unique SlotIndex and thus a LiveInterval which is better for RA.
//
// \p MI is the instruction being visited; nothing happens unless it is a COPY
// or KILL that closes a bundle (bundled with its predecessor but not with a
// successor). \p MF is only used to report an error when the copies form a
// cycle, \p TRI answers register-overlap queries, and \p Indexes receives the
// newly unbundled instructions. Operand 0 and operand 1 registers of every
// re-indexed instruction are added to \p RecomputeLIandLRM so that the caller
// can refresh their live intervals.
407+
void AIESuperRegRewriter::expandCopyBundle(
408+
MachineInstr &MI, MachineFunction &MF, const AIEBaseRegisterInfo &TRI,
409+
SlotIndexes &Indexes, SmallSet<Register, 8> &RecomputeLIandLRM) {
410+
if (!MI.isCopy() && !MI.isKill())
411+
return;
412+
413+
// Act only once per bundle: when visiting its last instruction.
if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
414+
// Bundle members, collected back-to-front starting with MI itself.
SmallVector<MachineInstr *, 2> MIs({&MI});
415+
416+
// Only do this when the complete bundle is made out of COPYs and KILLs.
417+
MachineBasicBlock &MBB = *MI.getParent();
418+
for (MachineBasicBlock::reverse_instr_iterator
419+
I = std::next(MI.getReverseIterator()),
420+
E = MBB.instr_rend();
421+
I != E && I->isBundledWithSucc(); ++I) {
422+
if (!I->isCopy() && !I->isKill())
423+
return;
424+
MIs.push_back(&*I);
425+
}
426+
// MIs was filled walking backwards, so its last element heads the bundle.
MachineInstr *FirstMI = MIs.back();
427+
428+
// Returns true when operand 0 of Dst overlaps operand 1 of any other
// instruction in Srcs.
// NOTE(review): assumes every COPY/KILL here has register operands at
// indices 0 and 1 -- confirm for KILL.
auto anyRegsAlias = [](const MachineInstr *Dst,
429+
ArrayRef<MachineInstr *> Srcs,
430+
const TargetRegisterInfo &TRI) {
431+
for (const MachineInstr *Src : Srcs)
432+
if (Src != Dst)
433+
if (TRI.regsOverlap(Dst->getOperand(0).getReg(),
434+
Src->getOperand(1).getReg()))
435+
return true;
436+
return false;
437+
};
438+
439+
// If any of the destination registers in the bundle of copies alias any of
440+
// the source registers, try to schedule the instructions to avoid any
441+
// clobbering.
442+
// [0, E) is the still-unscheduled prefix of MIs. Each pass moves every
// instruction whose destination overlaps no source in that prefix to the
// end of the prefix and shrinks E; a pass that makes no progress means the
// remaining copies depend on each other cyclically.
for (int E = MIs.size(), PrevE = E; E > 1; PrevE = E) {
443+
for (int I = E; I--;)
444+
if (!anyRegsAlias(MIs[I], ArrayRef(MIs).take_front(E), TRI)) {
445+
if (I + 1 != E)
446+
std::swap(MIs[I], MIs[E - 1]);
447+
--E;
448+
}
449+
if (PrevE == E) {
450+
MF.getFunction().getContext().emitError(
451+
"super-reg-rewriter register rewriting failed: cycle in copy "
452+
"bundle");
453+
break;
454+
}
455+
}
456+
457+
// Emit in reverse MIs order, i.e. instructions scheduled to the back of MIs
// end up earliest in the block.
MachineInstr *BundleStart = FirstMI;
458+
for (MachineInstr *BundledMI : llvm::reverse(MIs)) {
459+
// If instruction is in the middle of the bundle, move it before the
460+
// bundle starts, otherwise, just unbundle it. When we get to the last
461+
// instruction, the bundle will have been completely undone.
462+
if (BundledMI != BundleStart) {
463+
BundledMI->removeFromBundle();
464+
MBB.insert(BundleStart, BundledMI);
465+
} else if (BundledMI->isBundledWithSucc()) {
466+
BundledMI->unbundleFromSucc();
467+
BundleStart = &*std::next(BundledMI->getIterator());
468+
}
469+
470+
// Every instruction except the original bundle head is registered with
// the slot-index maps, its operand 0/1 registers are queued for live
// interval recomputation, and the internal-read flag on operand 0 is
// cleared.
// NOTE(review): FirstMI is presumably skipped because it already owns the
// bundle's slot index -- confirm.
if (BundledMI != FirstMI) {
471+
Indexes.insertMachineInstrInMaps(*BundledMI);
472+
RecomputeLIandLRM.insert(BundledMI->getOperand(0).getReg());
473+
RecomputeLIandLRM.insert(BundledMI->getOperand(1).getReg());
474+
BundledMI->getOperand(0).setIsInternalRead(false);
475+
}
476+
}
477+
}
478+
}
479+
368480
} // end anonymous namespace
369481

370482
char AIESuperRegRewriter::ID = 0;

llvm/test/CodeGen/AIE/aie2p/issue_2.ll

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,34 +20,36 @@ define void @issue_2(i32 %0, i1 %exitcond.not.i) {
2020
; CHECK-NEXT: movs dj4, m0; mov dn0, m0
2121
; CHECK-NEXT: movs dj2, m0; mov dn4, m0
2222
; CHECK-NEXT: movs dj6, m0; mov dn2, m0
23-
; CHECK-NEXT: movs dj3, m0; mov dn6, m0
24-
; CHECK-NEXT: movs dj7, m0; mov dn3, m0
25-
; CHECK-NEXT: mova r2, #1; movs dn7, m0; mov dc0, m0
26-
; CHECK-NEXT: movs dc4, m0; and r5, r1, r2; mov r2, m0
27-
; CHECK-NEXT: movs dc3, m0; mov r1, m0
28-
; CHECK-NEXT: movs dc2, m0; mov m2, m0
29-
; CHECK-NEXT: mova dn5, #1; movs dj5, m0; mov m3, m0
30-
; CHECK-NEXT: mova r3, #0; movs dn1, m0; mov m1, m0
23+
; CHECK-NEXT: movs dn6, m0; mov dc0, m0
24+
; CHECK-NEXT: movs dc4, m0; mov r4, m0
25+
; CHECK-NEXT: movs dc3, m0; mov r6, m0
26+
; CHECK-NEXT: mova dn5, #1; movs dc2, m0; mov r3, m0
27+
; CHECK-NEXT: movs dn3, m0; mov r5, m0
28+
; CHECK-NEXT: mova r16, #1; movs dj3, m0; mov r2, m0
29+
; CHECK-NEXT: movs dn7, m0; and r16, r1, r16; mov r1, m0
30+
; CHECK-NEXT: mova r7, #0; movs dj7, m0; mov m3, m0
3131
; CHECK-NEXT: .p2align 4
3232
; CHECK-NEXT: .LBB0_1: // %for.body58.i
3333
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
34-
; CHECK-NEXT: jz r5, #.LBB0_1
35-
; CHECK-NEXT: nop // Delay Slot 5
34+
; CHECK-NEXT: nopa ; nopb ; nops ; jz r16, #.LBB0_1; nopv
35+
; CHECK-NEXT: nopx // Delay Slot 5
3636
; CHECK-NEXT: mova p0, #0 // Delay Slot 4
3737
; CHECK-NEXT: paddb.3d [p0], d0 // Delay Slot 3
38-
; CHECK-NEXT: mova p0, #0; mov dc6, r3 // Delay Slot 2
39-
; CHECK-NEXT: paddb.3d [p0], d2; or r3, r0, r0; mov dc0, dn5 // Delay Slot 1
38+
; CHECK-NEXT: mova p0, #0; movs dc6, r7; mov m2, m0 // Delay Slot 2
39+
; CHECK-NEXT: paddb.3d [p0], d2; or r7, r0, r0; mov dc0, dn5 // Delay Slot 1
4040
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup57.i
4141
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
42-
; CHECK-NEXT: nopa ; nopb ; movs dc7, dn5; nopx ; mov dc0, dc5; nopv
43-
; CHECK-NEXT: nopa ; nopb ; nopx ; mov r3, dn5; movs dc1, r2
44-
; CHECK-NEXT: movs dj1, r1; mov dn5, m0
45-
; CHECK-NEXT: mova p0, #0; movs dc5, m0; j #.LBB0_1
42+
; CHECK-NEXT: nopx ; mov dc7, dn5
43+
; CHECK-NEXT: movs dc0, dc5; mov dc1, r1
44+
; CHECK-NEXT: movs dj1, r2; mov r7, dn5
45+
; CHECK-NEXT: movs dj5, m0; mov dn1, m0
46+
; CHECK-NEXT: movs dn5, m0; mov dc5, m0
47+
; CHECK-NEXT: mova p0, #0; movs m1, m0; j #.LBB0_1
4648
; CHECK-NEXT: paddb.3d [p0], d1 // Delay Slot 5
47-
; CHECK-NEXT: mova p0, #0; movs dc2, m0; mov dn5, r3 // Delay Slot 4
48-
; CHECK-NEXT: movs dj1, m0; paddb.3d [p0], d3; mov r2, dc1 // Delay Slot 3
49+
; CHECK-NEXT: mova p0, #0; movs dc2, m0; mov dn5, r7 // Delay Slot 4
50+
; CHECK-NEXT: movs dj1, m0; paddb.3d [p0], d3; mov r1, dc1 // Delay Slot 3
4951
; CHECK-NEXT: mova p0, #0; movs dc5, dc0; mov dc1, m0 // Delay Slot 2
50-
; CHECK-NEXT: mova r3, #0; paddb.3d [p0], d1; movs dc4, m0; mov dc0, m0 // Delay Slot 1
52+
; CHECK-NEXT: mova r7, #0; paddb.3d [p0], d1; movs dc4, m0; mov dc0, m0 // Delay Slot 1
5153
entry:
5254
br label %for.body.i
5355

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
;
3+
; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
; See https://llvm.org/LICENSE.txt for license information.
5+
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
;
7+
; (c) Copyright 2024-25 Advanced Micro Devices, Inc. or its affiliates
8+
; RUN: llc -mtriple=aie2p -verify-machineinstrs -o - < %s | FileCheck %s
9+
10+
target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
11+
target triple = "aie2p-none-unknown-elf"
12+
13+
define void @issue_3(i1 %exitcond.not.i) {
14+
; CHECK-LABEL: issue_3:
15+
; CHECK: .p2align 4
16+
; CHECK-NEXT: // %bb.0: // %entry
17+
; CHECK-NEXT: mova p3, #0; nopb ; nops ; paddxm [sp], #192; nopv
18+
; CHECK-NEXT: mova r1, #0; nopb ; jl p3; nopm ; nops
19+
; CHECK-NEXT: st r8, [sp, #-192]; vbcst.32 x0, r1 // 4-byte Folded Spill Delay Slot 5
20+
; CHECK-NEXT: st lr, [sp, #-188]; vmov x1, x0 // 4-byte Folded Spill Delay Slot 4
21+
; CHECK-NEXT: mova p0, #0; vst x0, [sp, #-128] // 64-byte Folded Spill Delay Slot 3
22+
; CHECK-NEXT: mova p1, #0; vst x1, [sp, #-64] // 64-byte Folded Spill Delay Slot 2
23+
; CHECK-NEXT: mova p2, #0; mov r8, r0 // Delay Slot 1
24+
; CHECK-NEXT: mova m4, #0; nopb ; nops ; nopxm ; nopv
25+
; CHECK-NEXT: mov dn0, m4
26+
; CHECK-NEXT: mov dn4, m4
27+
; CHECK-NEXT: mov dn1, m4
28+
; CHECK-NEXT: mov dn5, m4
29+
; CHECK-NEXT: mov dn2, m4
30+
; CHECK-NEXT: movs dc5, m4; mov dc1, m4
31+
; CHECK-NEXT: vlda x2, [sp, #-128]; movs dc2, m4; mov r1, m4 // 64-byte Folded Reload
32+
; CHECK-NEXT: vlda x3, [sp, #-64]; movs dc3, m4; movx r0, #1; mov r2, m4 // 64-byte Folded Reload
33+
; CHECK-NEXT: movs dc0, m4; and r3, r8, r0; mov r0, m4
34+
; CHECK-NEXT: movs m1, m4; mov dj7, m4
35+
; CHECK-NEXT: movs m3, m4; mov dj1, r1
36+
; CHECK-NEXT: movs m2, m4; mov dj5, r1
37+
; CHECK-NEXT: movs dn7, m4; mov dj2, r1
38+
; CHECK-NEXT: movs dj6, r1; vmov lfl0, x2
39+
; CHECK-NEXT: mova dc4, #0; movs dj3, r1; movx r4, #0; vmov lfh0, x3
40+
; CHECK-NEXT: .p2align 4
41+
; CHECK-NEXT: .LBB0_1: // %for.body.i
42+
; CHECK-NEXT: // =>This Loop Header: Depth=1
43+
; CHECK-NEXT: // Child Loop BB0_2 Depth 2
44+
; CHECK-NEXT: nopx ; vmov lfl1, lfl0
45+
; CHECK-NEXT: mova p1, #0; mov r25, r4
46+
; CHECK-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d1]; mov dj4, r1
47+
; CHECK-NEXT: mova p0, #0; movs m0, m4; mov dj0, r1
48+
; CHECK-NEXT: movs dn6, dn0; paddb.3d [p0], d0; vmov lfh1, lfh0
49+
; CHECK-NEXT: mova p0, #0; movs dn3, dn4; mov dc6, dc4
50+
; CHECK-NEXT: .p2align 4
51+
; CHECK-NEXT: .LBB0_2: // %for.body103.i
52+
; CHECK-NEXT: // Parent Loop BB0_1 Depth=1
53+
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
54+
; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; mov m0, m4; nopv
55+
; CHECK-NEXT: movs dn0, r0; jz r3, #.LBB0_2
56+
; CHECK-NEXT: movs dj0, r1; mov dc0, m4 // Delay Slot 5
57+
; CHECK-NEXT: movs dn4, r2; mov dc4, m4 // Delay Slot 4
58+
; CHECK-NEXT: movs dj4, r1; mov r25, r4 // Delay Slot 3
59+
; CHECK-NEXT: movs p1, p0; vmov lfl1, x2 // Delay Slot 2
60+
; CHECK-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d0]; vmov lfh1, x3 // Delay Slot 1
61+
; CHECK-NEXT: // %bb.3: // %for.cond.cleanup102.i
62+
; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1
63+
; CHECK-NEXT: nopa ; nopb ; nopxm
64+
; CHECK-NEXT: movs dc5, dc4; j #.LBB0_1
65+
; CHECK-NEXT: movs dn0, dn6; mov dc4, dc6 // Delay Slot 5
66+
; CHECK-NEXT: mova p0, #0; movs dn6, m4; mov dc6, m4 // Delay Slot 4
67+
; CHECK-NEXT: paddb.3d [p0], d2; mov dn4, dn3 // Delay Slot 3
68+
; CHECK-NEXT: mova p0, #0; movs dc7, m4; mov dn3, m4 // Delay Slot 2
69+
; CHECK-NEXT: mova dc0, #1; paddb.3d [p0], d3; movs dc1, dc0 // Delay Slot 1
70+
entry:
71+
tail call void null(ptr null, ptr null, ptr null)
72+
br label %for.body.i
73+
74+
for.body.i: ; preds = %for.cond.cleanup102.i, %entry
75+
%dimsAI.sroa.17.0665.i = phi i32 [ 0, %entry ], [ %20, %for.cond.cleanup102.i ]
76+
%dimsAI.sroa.13.0664.i = phi i32 [ 0, %entry ], [ %18, %for.cond.cleanup102.i ]
77+
%dimsAO.sroa.8.0662.i = phi i32 [ 0, %entry ], [ %11, %for.cond.cleanup102.i ]
78+
%dimsW.sroa.8.0660.i = phi i32 [ 0, %entry ], [ %15, %for.cond.cleanup102.i ]
79+
%iterator_psum_cnt0.0659.i = phi i32 [ 0, %entry ], [ 1, %for.cond.cleanup102.i ]
80+
%iterator_psum_cnt1.0658.i = phi i32 [ 0, %entry ], [ %7, %for.cond.cleanup102.i ]
81+
%0 = trunc i32 %iterator_psum_cnt0.0659.i to i20
82+
%1 = trunc i32 %iterator_psum_cnt1.0658.i to i20
83+
%2 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 0, i20 0, i20 %0, i20 0, i20 %1)
84+
%3 = extractvalue { ptr, i20, i20 } %2, 2
85+
%4 = trunc i32 %dimsAI.sroa.13.0664.i to i20
86+
%5 = trunc i32 %dimsAI.sroa.17.0665.i to i20
87+
%6 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5) null, <32 x i32> zeroinitializer, i32 0, i20 0, i20 0, i20 %4, i20 0, i20 0, i20 %5, i20 0)
88+
br label %for.body103.i
89+
90+
for.cond.cleanup102.i: ; preds = %for.body103.i
91+
%7 = zext i20 %3 to i32
92+
%8 = trunc i32 %dimsAO.sroa.8.0662.i to i20
93+
%9 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 0, i20 0, i20 %8, i20 0, i20 0)
94+
%10 = extractvalue { ptr, i20, i20 } %9, 1
95+
%11 = zext i20 %10 to i32
96+
%12 = trunc i32 %dimsW.sroa.8.0660.i to i20
97+
%13 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 0, i20 0, i20 %12, i20 0, i20 0)
98+
%14 = extractvalue { ptr, i20, i20 } %13, 1
99+
%15 = zext i20 %14 to i32
100+
br label %for.body.i
101+
102+
for.body103.i: ; preds = %for.body103.i, %for.body.i
103+
%16 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5) null, <32 x i32> zeroinitializer, i32 0, i20 0, i20 0, i20 0, i20 0, i20 0, i20 0, i20 0)
104+
%17 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %16, 3
105+
%18 = zext i20 %17 to i32
106+
%19 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %16, 4
107+
%20 = zext i20 %19 to i32
108+
br i1 %exitcond.not.i, label %for.cond.cleanup102.i, label %for.body103.i
109+
}
110+
111+
; Function Attrs: nounwind memory(none)
112+
declare { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr, i20, i20, i20, i20, i20, i20, i20) #0
113+
114+
; Function Attrs: nounwind memory(argmem: read)
115+
declare { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5), <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #1
116+
117+
; uselistorder directives
118+
uselistorder ptr @llvm.aie2p.add.3d, { 2, 1, 0 }
119+
uselistorder ptr @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5, { 1, 0 }
120+
121+
attributes #0 = { nounwind memory(none) }
122+
attributes #1 = { nounwind memory(argmem: read) }

0 commit comments

Comments
 (0)