Skip to content

Commit 598be5c

Browse files
Martien de Jong (martien-de-jong)
authored and committed
[AIE] Combine REG_SEQ of copies of subregs into a copy of the superreg
1 parent 73ef21e commit 598be5c

File tree

3 files changed

+262
-22
lines changed

3 files changed

+262
-22
lines changed

llvm/lib/Target/AIE/AIEPostSelectOptimize.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,73 @@ bool modifyStoreFlush(MachineBasicBlock &MBB, MachineRegisterInfo &MRI) {
617617
return Changed;
618618
}
619619

620+
static bool combineSubRegCopyToSuperregCopy(MachineBasicBlock &MBB,
621+
MachineRegisterInfo &MRI) {
622+
const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
623+
bool Changed = false;
624+
625+
for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
626+
if (!MI.isRegSequence())
627+
continue;
628+
629+
LLVM_DEBUG(llvm::dbgs() << "SubregCopyToSuperRegCopy visiting ");
630+
LLVM_DEBUG(MI.dump());
631+
Register CommonSrcReg;
632+
// TODO: We may want to check that the list covers the whole superregister
633+
// lest we run the risk of replacing an undefined register with the defined
634+
// source subreg.
635+
for (unsigned I = 1; I < MI.getNumExplicitOperands(); I += 2) {
636+
637+
const Register SrcReg = MI.getOperand(I).getReg();
638+
const unsigned SubRegIdx = MI.getOperand(I + 1).getImm();
639+
640+
auto *SubRegDef = MRI.getVRegDef(SrcReg);
641+
if (!SubRegDef->isCopy()) {
642+
LLVM_DEBUG(llvm::dbgs() << "Src is not a COPY. Skip\n");
643+
CommonSrcReg = {};
644+
break;
645+
}
646+
647+
if (!SubRegDef->getOperand(1).getSubReg()) {
648+
LLVM_DEBUG(llvm::dbgs() << "Src COPY is not a subreg COPY. Skip\n");
649+
CommonSrcReg = {};
650+
break;
651+
}
652+
653+
const unsigned SrcSubRegIdx = SubRegDef->getOperand(1).getSubReg();
654+
655+
if (SubRegIdx != SrcSubRegIdx) {
656+
LLVM_DEBUG(llvm::dbgs() << "SubReg indexes are not the same. Skip\n");
657+
CommonSrcReg = {};
658+
break;
659+
}
660+
661+
Register SubRegCopySrc = SubRegDef->getOperand(1).getReg();
662+
if (CommonSrcReg && SubRegCopySrc != CommonSrcReg) {
663+
LLVM_DEBUG(llvm::dbgs()
664+
<< "REG_SEQUENCE from different sources. Skip\n");
665+
CommonSrcReg = {};
666+
break;
667+
}
668+
CommonSrcReg = SubRegCopySrc;
669+
}
670+
671+
if (CommonSrcReg) {
672+
LLVM_DEBUG(llvm::dbgs() << "Folding ");
673+
LLVM_DEBUG(MI.dump());
674+
LLVM_DEBUG(llvm::dbgs() << "into superreg COPY\n");
675+
auto MIB =
676+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
677+
MI.getOperand(0).getReg())
678+
.addReg(CommonSrcReg);
679+
LLVM_DEBUG(MIB->dump());
680+
MI.eraseFromParent();
681+
Changed = true;
682+
}
683+
}
684+
return Changed;
685+
}
686+
620687
bool AIEPostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
621688
LLVM_DEBUG(dbgs() << "\n******* POST I-SEL OPTIMIZATION PASS *******\n"
622689
<< "********** Function: " << MF.getName() << '\n');
@@ -632,6 +699,12 @@ bool AIEPostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
632699

633700
bool Changed = false;
634701

702+
// 0. Fold REG_SEQUENCE((COPY %0.sub_bfp16_x), %subreg.sub_bfp16_x,
703+
// (COPY %0.sub_bfp16_e), %subreg.sub_bfp16_e) into COPY %0
704+
for (MachineBasicBlock &MBB : MF) {
705+
Changed |= combineSubRegCopyToSuperregCopy(MBB, MF.getRegInfo());
706+
}
707+
635708
// 1. Turn INSERT_SUBREG into REG_SEQUENCE when possible
636709
for (MachineBasicBlock &MBB : MF) {
637710
Changed |= combineINSERT_SUBREG(MBB);

llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -79,18 +79,18 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
7979
; CHECK-NEXT: movxm r6, #15616; vmul.f bmh2, x0, x3, r1
8080
; CHECK-NEXT: movxm r7, #16000
8181
; CHECK-NEXT: vbcst.16 x1, r3
82-
; CHECK-NEXT: vbcst.16 x10, r4
83-
; CHECK-NEXT: vbcst.16 x8, r5; vmul.f bmh3, x0, x3, r1
82+
; CHECK-NEXT: vbcst.16 x8, r4
83+
; CHECK-NEXT: vbcst.16 x10, r5; vmul.f bmh3, x0, x3, r1
8484
; CHECK-NEXT: vbcst.16 x6, r6
8585
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh2; vbcst.16 x4, r7
8686
; CHECK-NEXT: vmov wh6, wl2
8787
; CHECK-NEXT: vmin_ge.bf16 x3, r16, x3, x1
88-
; CHECK-NEXT: or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x10
89-
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x8, x3
88+
; CHECK-NEXT: or r8, r16, r16; vmax_lt.bf16 x3, r16, x3, x8
89+
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh3; vband x7, x10, x3
9090
; CHECK-NEXT: vmov wh7, wl2
9191
; CHECK-NEXT: vmin_ge.bf16 x5, r16, x5, x1
92-
; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x10
93-
; CHECK-NEXT: vband x7, x8, x5
92+
; CHECK-NEXT: vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x5, x8
93+
; CHECK-NEXT: vband x7, x10, x5
9494
; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh7, wl2; vmul.f bmh2, x6, x7, r1
9595
; CHECK-NEXT: vmov wh4, wl2
9696
; CHECK-NEXT: vmov wh3, wl2; vmul.f bmh4, x6, x7, r1
@@ -105,16 +105,16 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
105105
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh5; movxm le, #.L_LEnd0
106106
; CHECK-NEXT: add.nc lc, r2, #-2
107107
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh7; vmin_ge.bf16 x3, r16, x3, x1
108-
; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x10
108+
; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x8
109109
; CHECK-NEXT: mova r0, #28; vconv.bf16.fp32 wl7, bmh3; vmin_ge.bf16 x11, r16, x5, x1
110110
; CHECK-NEXT: .p2align 4
111111
; CHECK-NEXT: .LBB0_1: // %for.body
112112
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
113-
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x8, x3; nopv
113+
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x10, x3; nopv
114114
; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh3, wl2
115115
; CHECK-NEXT: nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1
116-
; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x10; vmac.f bmh4, bmh0, x3, x4, r1
117-
; CHECK-NEXT: vband x9, x8, x5; vmul.f bmh2, x6, x9, r1
116+
; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x8; vmac.f bmh4, bmh0, x3, x4, r1
117+
; CHECK-NEXT: vband x9, x10, x5; vmul.f bmh2, x6, x9, r1
118118
; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
119119
; CHECK-NEXT: vsub.f bml0, bmh5, bmh1, r0
120120
; CHECK-NEXT: vmul.f bmh3, x6, x9, r1
@@ -127,29 +127,29 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
127127
; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh7
128128
; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmsc.f bml4, bml2, x3, x5, r1
129129
; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x9, r16, x3, x1
130-
; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x10
130+
; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x8
131131
; CHECK-NEXT: .L_LEnd0:
132132
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv
133133
; CHECK-NEXT: // %bb.2:
134-
; CHECK-NEXT: nopa ; nopb ; nopxm
134+
; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv
135135
; CHECK-NEXT: vmov wh7, wl2
136136
; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
137-
; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
138-
; CHECK-NEXT: vmax_lt.bf16 x10, r16, x11, x10; vmul.f bmh2, x1, x0, r1
139-
; CHECK-NEXT: vband x1, x8, x3
140-
; CHECK-NEXT: vband x8, x8, x10
137+
; CHECK-NEXT: vmul.f bmh3, x7, x0, r1
138+
; CHECK-NEXT: vmax_lt.bf16 x8, r16, x11, x8; vmul.f bmh2, x1, x0, r1
139+
; CHECK-NEXT: vband x1, x10, x3
140+
; CHECK-NEXT: vband x10, x10, x8
141141
; CHECK-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
142-
; CHECK-NEXT: vmov wh8, wl2; vsub.f bmh2, bmh2, bmh1, r0
142+
; CHECK-NEXT: vmov wh10, wl2; vsub.f bmh2, bmh2, bmh1, r0
143143
; CHECK-NEXT: vmul.f bmh2, x6, x1, r1
144-
; CHECK-NEXT: vmov wh4, wl2; vmul.f bmh3, x6, x8, r1
144+
; CHECK-NEXT: vmul.f bmh3, x6, x10, r1
145145
; CHECK-NEXT: vmov wh3, wl2
146-
; CHECK-NEXT: vmov wh10, wl2
146+
; CHECK-NEXT: vmov wh8, wl2
147147
; CHECK-NEXT: vst.conv.bf16.fp32 bmh3, [p1], #32; vmac.f bmh4, bmh0, x3, x4, r1
148-
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x10, x4, r1
149-
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh2
148+
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32; vmac.f bmh0, bmh0, x8, x4, r1
149+
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh2; vmov wh4, wl2
150150
; CHECK-NEXT: vconv.bf16.fp32 wl4, bmh3
151151
; CHECK-NEXT: vmsc.f bmh2, bmh4, x4, x3, r1
152-
; CHECK-NEXT: vmsc.f bmh0, bmh0, x4, x10, r1
152+
; CHECK-NEXT: vmsc.f bmh0, bmh0, x4, x8, r1
153153
; CHECK-NEXT: nop
154154
; CHECK-NEXT: nop
155155
; CHECK-NEXT: nop
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
#
3+
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
8+
9+
# RUN: llc -mtriple aie2p -run-pass=aie-post-select-optimize,dead-mi-elimination %s -verify-machineinstrs -o - | FileCheck %s
10+
11+
# This file tests various post-select combines for aie2p
12+
13+
---
14+
name: REG_SEQ_of_subreg_copies
15+
alignment: 16
16+
legalized: true
17+
regBankSelected: true
18+
selected: true
19+
tracksRegLiveness: true
20+
body: |
21+
bb.0.entry:
22+
liveins: $ex0
23+
; CHECK-LABEL: name: REG_SEQ_of_subreg_copies
24+
; CHECK: liveins: $ex0
25+
; CHECK-NEXT: {{ $}}
26+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vec576 = COPY $ex0
27+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec576 = COPY [[COPY]]
28+
; CHECK-NEXT: $ex0 = COPY [[COPY1]]
29+
; CHECK-NEXT: PseudoRET implicit $lr, implicit $ex0
30+
%0:vec576 = COPY $ex0
31+
%1:vec512 = COPY %0.sub_bfp16_x
32+
%2:expvec64 = COPY %0.sub_bfp16_e
33+
%3:vec576 = REG_SEQUENCE %1, %subreg.sub_bfp16_x, %2, %subreg.sub_bfp16_e
34+
$ex0 = COPY %3
35+
PseudoRET implicit $lr, implicit $ex0
36+
...
37+
38+
---
39+
name: dont_REG_SEQ_of_different_sources
40+
alignment: 16
41+
legalized: true
42+
regBankSelected: true
43+
selected: true
44+
tracksRegLiveness: true
45+
body: |
46+
bb.0.entry:
47+
liveins: $ex0, $ex1
48+
; CHECK-LABEL: name: dont_REG_SEQ_of_different_sources
49+
; CHECK: liveins: $ex0, $ex1
50+
; CHECK-NEXT: {{ $}}
51+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vec576 = COPY $ex0
52+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec576 = COPY $ex1
53+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec512 = COPY [[COPY]].sub_bfp16_x
54+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:expvec64 = COPY [[COPY1]].sub_bfp16_e
55+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec576 = REG_SEQUENCE [[COPY2]], %subreg.sub_bfp16_x, [[COPY3]], %subreg.sub_bfp16_e
56+
; CHECK-NEXT: $ex0 = COPY [[REG_SEQUENCE]]
57+
; CHECK-NEXT: PseudoRET implicit $lr, implicit $ex0
58+
%0:vec576 = COPY $ex0
59+
%1:vec576 = COPY $ex1
60+
%2:vec512 = COPY %0.sub_bfp16_x
61+
%3:expvec64 = COPY %1.sub_bfp16_e
62+
%4:vec576 = REG_SEQUENCE %2, %subreg.sub_bfp16_x, %3, %subreg.sub_bfp16_e
63+
$ex0 = COPY %4
64+
PseudoRET implicit $lr, implicit $ex0
65+
...
66+
---
67+
name: dont_REG_SEQ_of_non_subreg
68+
alignment: 16
69+
legalized: true
70+
regBankSelected: true
71+
selected: true
72+
tracksRegLiveness: true
73+
body: |
74+
bb.0.entry:
75+
liveins: $ex0
76+
; CHECK-LABEL: name: dont_REG_SEQ_of_non_subreg
77+
; CHECK: liveins: $ex0
78+
; CHECK-NEXT: {{ $}}
79+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vec576 = COPY $ex0
80+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY]].sub_bfp16_x
81+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:expvec64 = COPY $e0
82+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec576 = REG_SEQUENCE [[COPY1]], %subreg.sub_bfp16_x, [[COPY2]], %subreg.sub_bfp16_e
83+
; CHECK-NEXT: $ex0 = COPY [[REG_SEQUENCE]]
84+
; CHECK-NEXT: PseudoRET implicit $lr, implicit $ex0
85+
%0:vec576 = COPY $ex0
86+
%2:vec512 = COPY %0.sub_bfp16_x
87+
%3:expvec64 = COPY $e0
88+
%4:vec576 = REG_SEQUENCE %2, %subreg.sub_bfp16_x, %3, %subreg.sub_bfp16_e
89+
$ex0 = COPY %4
90+
PseudoRET implicit $lr, implicit $ex0
91+
...
92+
---
93+
name: do_REG_SEQ_of_subregs
94+
alignment: 16
95+
legalized: true
96+
regBankSelected: true
97+
selected: true
98+
tracksRegLiveness: true
99+
body: |
100+
bb.0.entry:
101+
liveins: $x0
102+
; CHECK-LABEL: name: do_REG_SEQ_of_subregs
103+
; CHECK: liveins: $x0
104+
; CHECK-NEXT: {{ $}}
105+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
106+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec512 = COPY [[COPY]]
107+
; CHECK-NEXT: $x0 = COPY [[COPY1]]
108+
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
109+
%0:vec512 = COPY $x0
110+
%2:vec256 = COPY %0.sub_256_lo
111+
%3:vec256 = COPY %0.sub_256_hi
112+
%4:vec512 = REG_SEQUENCE %2, %subreg.sub_256_lo, %3, %subreg.sub_256_hi
113+
$x0 = COPY %4
114+
PseudoRET implicit $lr, implicit $x0
115+
...
116+
---
117+
name: dont_REG_SEQ_of_swapped_index
118+
alignment: 16
119+
legalized: true
120+
regBankSelected: true
121+
selected: true
122+
tracksRegLiveness: true
123+
body: |
124+
bb.0.entry:
125+
liveins: $x0
126+
; CHECK-LABEL: name: dont_REG_SEQ_of_swapped_index
127+
; CHECK: liveins: $x0
128+
; CHECK-NEXT: {{ $}}
129+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
130+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec256 = COPY [[COPY]].sub_256_lo
131+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec256 = COPY [[COPY]].sub_256_hi
132+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[COPY2]], %subreg.sub_256_lo, [[COPY1]], %subreg.sub_256_hi
133+
; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]]
134+
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
135+
%0:vec512 = COPY $x0
136+
%2:vec256 = COPY %0.sub_256_lo
137+
%3:vec256 = COPY %0.sub_256_hi
138+
%4:vec512 = REG_SEQUENCE %3, %subreg.sub_256_lo, %2, %subreg.sub_256_hi
139+
$x0 = COPY %4
140+
PseudoRET implicit $lr, implicit $x0
141+
...
142+
---
143+
name: dont_REG_SEQ_of_equal_index
144+
alignment: 16
145+
legalized: true
146+
regBankSelected: true
147+
selected: true
148+
tracksRegLiveness: true
149+
body: |
150+
bb.0.entry:
151+
liveins: $x0
152+
; CHECK-LABEL: name: dont_REG_SEQ_of_equal_index
153+
; CHECK: liveins: $x0
154+
; CHECK-NEXT: {{ $}}
155+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0
156+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vec256 = COPY [[COPY]].sub_256_lo
157+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vec256 = COPY [[COPY]].sub_256_lo
158+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vec512 = REG_SEQUENCE [[COPY1]], %subreg.sub_256_lo, [[COPY2]], %subreg.sub_256_hi
159+
; CHECK-NEXT: $x0 = COPY [[REG_SEQUENCE]]
160+
; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0
161+
%0:vec512 = COPY $x0
162+
%2:vec256 = COPY %0.sub_256_lo
163+
%3:vec256 = COPY %0.sub_256_lo
164+
%4:vec512 = REG_SEQUENCE %2, %subreg.sub_256_lo, %3, %subreg.sub_256_hi
165+
$x0 = COPY %4
166+
PseudoRET implicit $lr, implicit $x0
167+
...

0 commit comments

Comments
 (0)