Skip to content

Commit a1f0d2e

Browse files
committed
[TailDup] Delay aggressive computed-goto taildup to after RegAlloc.
#114990 allowed more aggressive tail duplication for computed-gotos in both pre- and post-regalloc tail duplication. In some cases, performing tail-duplication too early can lead to worse results, especially if we duplicate blocks with a number of phi nodes. This is causing a ~3% performance regression in some workloads using Python 3.12. This patch updates TailDup to delay aggressive tail-duplication for computed gotos to after register allocation. This means we can keep the non-duplicated version for a bit longer throughout the backend, which should reduce compile time and allow a number of optimizations and simplifications to trigger before the CFG is drastically expanded. For the case in #106846, I get the same performance with and without this patch on Skylake.
1 parent 7304df1 commit a1f0d2e

File tree

3 files changed

+62
-75
lines changed

3 files changed

+62
-75
lines changed

llvm/lib/CodeGen/TailDuplicator.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,15 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
610610
if (HasIndirectbr && PreRegAlloc)
611611
MaxDuplicateCount = TailDupIndirectBranchSize;
612612

613+
// Allow higher limits when the block has computed gotos and we run after
614+
// register allocation. NB. This basically unfactors computed gotos that were
615+
// factored early on in the compilation process to speed up edge based data
616+
// flow. If we do not unfactor them again, it can seriously pessimize code
617+
// with many computed jumps in the source code, such as interpreters.
618+
// Therefore we do not restrict the computed gotos.
619+
if (HasComputedGoto && !PreRegAlloc)
620+
MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
621+
613622
// Check the instructions in the block to determine whether tail-duplication
614623
// is invalid or unlikely to be profitable.
615624
unsigned InstrCount = 0;
@@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
663672
// Duplicating a BB which has both multiple predecessors and successors
664673
// may create a huge number of PHI nodes. If we want to remove this limitation,
665674
// we have to address https://github.com/llvm/llvm-project/issues/78578.
666-
// NB. This basically unfactors computed gotos that were factored early on in
667-
// the compilation process to speed up edge based data flow. If we do not
668-
// unfactor them again, it can seriously pessimize code with many computed
669-
// jumps in the source code, such as interpreters. Therefore we do not
670-
// restrict the computed gotos.
671-
if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
675+
if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
672676
TailBB.succ_size() > TailDupSuccSize) {
673677
// If TailBB or any of its successors contains a phi, we may have to add a
674678
// large number of additional phis with additional incoming values.

llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll

Lines changed: 29 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -25,77 +25,58 @@ define void @test_interp(ptr %frame, ptr %dst) {
2525
; CHECK-NEXT: adrp x21, _opcode.targets@PAGE
2626
; CHECK-NEXT: Lloh1:
2727
; CHECK-NEXT: add x21, x21, _opcode.targets@PAGEOFF
28-
; CHECK-NEXT: mov x22, xzr
28+
; CHECK-NEXT: mov x24, xzr
2929
; CHECK-NEXT: add x8, x21, xzr, lsl #3
3030
; CHECK-NEXT: mov x19, x1
3131
; CHECK-NEXT: mov x20, x0
32-
; CHECK-NEXT: add x23, x22, #1
32+
; CHECK-NEXT: mov x23, xzr
33+
; CHECK-NEXT: mov w22, #1 ; =0x1
34+
; CHECK-NEXT: add x24, x24, #1
3335
; CHECK-NEXT: br x8
3436
; CHECK-NEXT: Ltmp0: ; Block address taken
3537
; CHECK-NEXT: LBB0_1: ; %loop.header
3638
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
37-
; CHECK-NEXT: add x8, x21, x23, lsl #3
39+
; CHECK-NEXT: add x8, x21, x24, lsl #3
3840
; CHECK-NEXT: mov x20, xzr
39-
; CHECK-NEXT: mov x22, xzr
40-
; CHECK-NEXT: add x23, x23, #1
41+
; CHECK-NEXT: mov x23, xzr
42+
; CHECK-NEXT: add x24, x24, #1
4143
; CHECK-NEXT: br x8
4244
; CHECK-NEXT: Ltmp1: ; Block address taken
4345
; CHECK-NEXT: LBB0_2: ; %op1.bb
44-
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
4546
; CHECK-NEXT: str xzr, [x19]
46-
; CHECK-NEXT: mov w8, #1 ; =0x1
47+
; CHECK-NEXT: Ltmp2: ; Block address taken
48+
; CHECK-NEXT: LBB0_3: ; %op6.bb
49+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
4750
; CHECK-NEXT: ldr x0, [x20, #-8]!
48-
; CHECK-NEXT: ldr x9, [x0, #8]
49-
; CHECK-NEXT: str x8, [x0]
50-
; CHECK-NEXT: ldr x8, [x9, #48]
51+
; CHECK-NEXT: ldr x8, [x0, #8]
52+
; CHECK-NEXT: str x22, [x0]
53+
; CHECK-NEXT: ldr x8, [x8, #48]
5154
; CHECK-NEXT: blr x8
52-
; CHECK-NEXT: add x8, x21, x23, lsl #3
53-
; CHECK-NEXT: add x23, x23, #1
55+
; CHECK-NEXT: add x8, x21, x24, lsl #3
56+
; CHECK-NEXT: add x24, x24, #1
5457
; CHECK-NEXT: br x8
55-
; CHECK-NEXT: Ltmp2: ; Block address taken
56-
; CHECK-NEXT: LBB0_3: ; %op2.bb
58+
; CHECK-NEXT: Ltmp3: ; Block address taken
59+
; CHECK-NEXT: LBB0_4: ; %op2.bb
5760
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
58-
; CHECK-NEXT: add x8, x21, x23, lsl #3
61+
; CHECK-NEXT: add x8, x21, x24, lsl #3
5962
; CHECK-NEXT: mov x20, xzr
60-
; CHECK-NEXT: add x23, x23, #1
61-
; CHECK-NEXT: str x22, [x19]
62-
; CHECK-NEXT: mov x22, xzr
63+
; CHECK-NEXT: str x23, [x19]
64+
; CHECK-NEXT: mov x23, xzr
65+
; CHECK-NEXT: add x24, x24, #1
6366
; CHECK-NEXT: br x8
64-
; CHECK-NEXT: Ltmp3: ; Block address taken
65-
; CHECK-NEXT: LBB0_4: ; %op4.bb
66-
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
67-
; CHECK-NEXT: str x22, [x19]
68-
; CHECK-NEXT: add x10, x21, x23, lsl #3
69-
; CHECK-NEXT: add x23, x23, #1
70-
; CHECK-NEXT: ldur x8, [x22, #12]
71-
; CHECK-NEXT: ldur x9, [x20, #-8]
72-
; CHECK-NEXT: add x22, x22, #20
73-
; CHECK-NEXT: stp x8, x9, [x20, #-8]
74-
; CHECK-NEXT: add x20, x20, #8
75-
; CHECK-NEXT: br x10
7667
; CHECK-NEXT: Ltmp4: ; Block address taken
77-
; CHECK-NEXT: LBB0_5: ; %op5.bb
68+
; CHECK-NEXT: LBB0_5: ; %op4.bb
69+
; CHECK-NEXT: Ltmp5: ; Block address taken
70+
; CHECK-NEXT: LBB0_6: ; %op5.bb
7871
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
79-
; CHECK-NEXT: str x22, [x19]
80-
; CHECK-NEXT: add x10, x21, x23, lsl #3
81-
; CHECK-NEXT: add x23, x23, #1
82-
; CHECK-NEXT: ldur x8, [x22, #12]
72+
; CHECK-NEXT: str x23, [x19]
73+
; CHECK-NEXT: ldur x8, [x23, #12]
8374
; CHECK-NEXT: ldur x9, [x20, #-8]
84-
; CHECK-NEXT: add x22, x22, #20
75+
; CHECK-NEXT: add x23, x23, #20
8576
; CHECK-NEXT: stp x8, x9, [x20, #-8]
77+
; CHECK-NEXT: add x8, x21, x24, lsl #3
8678
; CHECK-NEXT: add x20, x20, #8
87-
; CHECK-NEXT: br x10
88-
; CHECK-NEXT: Ltmp5: ; Block address taken
89-
; CHECK-NEXT: LBB0_6: ; %op6.bb
90-
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
91-
; CHECK-NEXT: ldr x0, [x20, #-8]!
92-
; CHECK-NEXT: mov w8, #1 ; =0x1
93-
; CHECK-NEXT: ldr x9, [x0, #8]
94-
; CHECK-NEXT: str x8, [x0]
95-
; CHECK-NEXT: ldr x8, [x9, #48]
96-
; CHECK-NEXT: blr x8
97-
; CHECK-NEXT: add x8, x21, x23, lsl #3
98-
; CHECK-NEXT: add x23, x23, #1
79+
; CHECK-NEXT: add x24, x24, #1
9980
; CHECK-NEXT: br x8
10081
; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1
10182
entry:

llvm/test/CodeGen/X86/tail-dup-computed-goto.mir renamed to llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
22
# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
3-
# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
3+
#
4+
# Check that, before register allocation, computed gotos are restricted by tail-dup-pred-size and tail-dup-succ-size just like all other blocks.
5+
#
46
--- |
57
@computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
68
declare i64 @f0()
@@ -30,54 +32,54 @@ tracksRegLiveness: true
3032
body: |
3133
; CHECK-LABEL: name: computed_goto
3234
; CHECK: bb.0:
33-
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
35+
; CHECK-NEXT: successors: %bb.5(0x80000000)
3436
; CHECK-NEXT: {{ $}}
3537
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
3638
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
3739
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
38-
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rax
39-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
40-
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY]], @computed_goto.dispatch, $noreg
40+
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax
41+
; CHECK-NEXT: JMP_1 %bb.5
4142
; CHECK-NEXT: {{ $}}
4243
; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
43-
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
44+
; CHECK-NEXT: successors: %bb.5(0x80000000)
4445
; CHECK-NEXT: {{ $}}
4546
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
4647
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
4748
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
48-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY $rax
49-
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
50-
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
49+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rax
50+
; CHECK-NEXT: JMP_1 %bb.5
5151
; CHECK-NEXT: {{ $}}
5252
; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
53-
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
53+
; CHECK-NEXT: successors: %bb.5(0x80000000)
5454
; CHECK-NEXT: {{ $}}
5555
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
5656
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
5757
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
58-
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64_nosp = COPY $rax
59-
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
60-
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg
58+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rax
59+
; CHECK-NEXT: JMP_1 %bb.5
6160
; CHECK-NEXT: {{ $}}
6261
; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
63-
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
62+
; CHECK-NEXT: successors: %bb.5(0x80000000)
6463
; CHECK-NEXT: {{ $}}
6564
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
6665
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
6766
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
68-
; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY $rax
69-
; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
70-
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY6]], @computed_goto.dispatch, $noreg
67+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax
68+
; CHECK-NEXT: JMP_1 %bb.5
7169
; CHECK-NEXT: {{ $}}
7270
; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
73-
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
71+
; CHECK-NEXT: successors: %bb.5(0x80000000)
7472
; CHECK-NEXT: {{ $}}
7573
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
7674
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
7775
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
78-
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64_nosp = COPY $rax
79-
; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = COPY [[COPY8]]
80-
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY8]], @computed_goto.dispatch, $noreg
76+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax
77+
; CHECK-NEXT: {{ $}}
78+
; CHECK-NEXT: bb.5:
79+
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
80+
; CHECK-NEXT: {{ $}}
81+
; CHECK-NEXT: [[PHI:%[0-9]+]]:gr64_nosp = PHI [[COPY]], %bb.0, [[COPY4]], %bb.4, [[COPY3]], %bb.3, [[COPY2]], %bb.2, [[COPY1]], %bb.1
82+
; CHECK-NEXT: JMP64m $noreg, 8, [[PHI]], @computed_goto.dispatch, $noreg
8183
bb.0:
8284
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
8385
CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax

0 commit comments

Comments
 (0)