Skip to content

release/21.x: [TailDup] Delay aggressive computed-goto taildup to after RegAlloc. (#150911) #151680

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: release/21.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions llvm/include/llvm/CodeGen/MachineBasicBlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,11 @@ class MachineBasicBlock
const MachineFunction *getParent() const { return xParent; }
MachineFunction *getParent() { return xParent; }

/// Returns true if the original IR terminator is an `indirectbr`. This
/// typically corresponds to a `goto` in C, rather than jump tables.
bool terminatorIsComputedGoto() const {
return back().isIndirectBranch() &&
/// Returns true if the original IR terminator is an `indirectbr` with
/// successor blocks. This typically corresponds to a `goto` in C, rather than
/// jump tables.
bool terminatorIsComputedGotoWithSuccessors() const {
return back().isIndirectBranch() && !succ_empty() &&
llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
return Succ->isIRBlockAddressTaken();
});
Expand Down
18 changes: 11 additions & 7 deletions llvm/lib/CodeGen/TailDuplicator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -604,12 +604,21 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
bool HasComputedGoto = false;
if (!TailBB.empty()) {
HasIndirectbr = TailBB.back().isIndirectBranch();
HasComputedGoto = TailBB.terminatorIsComputedGoto();
HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
}

if (HasIndirectbr && PreRegAlloc)
MaxDuplicateCount = TailDupIndirectBranchSize;

// Allow higher limits when the block has computed gotos and we are running
// after register allocation. NB. This basically unfactors computed gotos that
// were factored early on in the compilation process to speed up edge-based
// data flow. If we do not unfactor them again, it can seriously pessimize
// code with many computed jumps in the source code, such as interpreters.
// Therefore we do not restrict the computed gotos.
if (HasComputedGoto && !PreRegAlloc)
MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);

// Check the instructions in the block to determine whether tail-duplication
// is invalid or unlikely to be profitable.
unsigned InstrCount = 0;
Expand Down Expand Up @@ -663,12 +672,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
// Duplicating a BB which has both multiple predecessors and successors
// may cause a huge number of PHI nodes. If we want to remove this limitation,
// we have to address https://github.com/llvm/llvm-project/issues/78578.
// NB. This basically unfactors computed gotos that were factored early on in
// the compilation process to speed up edge based data flow. If we do not
// unfactor them again, it can seriously pessimize code with many computed
// jumps in the source code, such as interpreters. Therefore we do not
// restrict the computed gotos.
if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
if (PreRegAlloc && TailBB.pred_size() > TailDupPredSize &&
TailBB.succ_size() > TailDupSuccSize) {
// If TailBB or any of its successors contains a phi, we may have to add a
// large number of additional phis with additional incoming values.
Expand Down
143 changes: 143 additions & 0 deletions llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -tail-dup-pred-size=2 -tail-dup-succ-size=2 -o - %s | FileCheck %s

target triple = "arm64-apple-macosx13.0.0"

@opcode.targets = local_unnamed_addr constant [6 x ptr] [ptr blockaddress(@test_interp, %op1.bb), ptr blockaddress(@test_interp, %op6.bb), ptr blockaddress(@test_interp, %loop.header), ptr blockaddress(@test_interp, %op2.bb), ptr blockaddress(@test_interp, %op4.bb), ptr blockaddress(@test_interp, %op5.bb)]

; Interpreter-style dispatch loop. %loop.header ends in an indirectbr with six
; possible targets and has seven incoming edges (the entry block plus every
; handler, including itself), which exceeds both the -tail-dup-pred-size=2 and
; -tail-dup-succ-size=2 limits given on the RUN line. Each op*.bb handler
; branches back to %loop.header.
;
; The expected assembly below ends every emitted handler with its own
; `add x8, x21, x24, lsl #3` ... `br x8` sequence, i.e. the dispatch code has
; been duplicated into the handlers instead of being kept as one shared block.
define void @test_interp(ptr %frame, ptr %dst) {
; CHECK-LABEL: test_interp:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: stp x24, x23, [sp, #-64]! ; 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #16] ; 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] ; 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: .cfi_offset w19, -24
; CHECK-NEXT: .cfi_offset w20, -32
; CHECK-NEXT: .cfi_offset w21, -40
; CHECK-NEXT: .cfi_offset w22, -48
; CHECK-NEXT: .cfi_offset w23, -56
; CHECK-NEXT: .cfi_offset w24, -64
; CHECK-NEXT: Lloh0:
; CHECK-NEXT: adrp x21, _opcode.targets@PAGE
; CHECK-NEXT: Lloh1:
; CHECK-NEXT: add x21, x21, _opcode.targets@PAGEOFF
; CHECK-NEXT: mov x24, xzr
; CHECK-NEXT: add x8, x21, xzr, lsl #3
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: mov x23, xzr
; CHECK-NEXT: mov w22, #1 ; =0x1
; CHECK-NEXT: add x24, x24, #1
; CHECK-NEXT: br x8
; CHECK-NEXT: Ltmp0: ; Block address taken
; CHECK-NEXT: LBB0_1: ; %loop.header
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x8, x21, x24, lsl #3
; CHECK-NEXT: mov x20, xzr
; CHECK-NEXT: mov x23, xzr
; CHECK-NEXT: add x24, x24, #1
; CHECK-NEXT: br x8
; CHECK-NEXT: Ltmp1: ; Block address taken
; CHECK-NEXT: LBB0_2: ; %op1.bb
; CHECK-NEXT: str xzr, [x19]
; CHECK-NEXT: Ltmp2: ; Block address taken
; CHECK-NEXT: LBB0_3: ; %op6.bb
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr x0, [x20, #-8]!
; CHECK-NEXT: ldr x8, [x0, #8]
; CHECK-NEXT: str x22, [x0]
; CHECK-NEXT: ldr x8, [x8, #48]
; CHECK-NEXT: blr x8
; CHECK-NEXT: add x8, x21, x24, lsl #3
; CHECK-NEXT: add x24, x24, #1
; CHECK-NEXT: br x8
; CHECK-NEXT: Ltmp3: ; Block address taken
; CHECK-NEXT: LBB0_4: ; %op2.bb
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x8, x21, x24, lsl #3
; CHECK-NEXT: mov x20, xzr
; CHECK-NEXT: str x23, [x19]
; CHECK-NEXT: mov x23, xzr
; CHECK-NEXT: add x24, x24, #1
; CHECK-NEXT: br x8
; CHECK-NEXT: Ltmp4: ; Block address taken
; CHECK-NEXT: LBB0_5: ; %op4.bb
; CHECK-NEXT: Ltmp5: ; Block address taken
; CHECK-NEXT: LBB0_6: ; %op5.bb
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: str x23, [x19]
; CHECK-NEXT: ldur x8, [x23, #12]
; CHECK-NEXT: ldur x9, [x20, #-8]
; CHECK-NEXT: add x23, x23, #20
; CHECK-NEXT: stp x8, x9, [x20, #-8]
; CHECK-NEXT: add x8, x21, x24, lsl #3
; CHECK-NEXT: add x20, x20, #8
; CHECK-NEXT: add x24, x24, #1
; CHECK-NEXT: br x8
; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1
entry:
br label %loop.header

; Dispatch block: carries the index, stack pointer and next-instruction pointer
; as phis, increments the index, and issues the indirectbr. Note that the
; indirectbr operand is the address of slot %iv in @opcode.targets, not a value
; loaded from it (presumably an artifact of test reduction).
loop.header:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %op1.bb ], [ %iv.next, %op2.bb ], [ %iv.next, %op4.bb ], [ %iv.next, %op5.bb ], [ %iv.next, %op6.bb ], [ %iv.next, %loop.header ]
%stack.pointer = phi ptr [ %frame, %entry ], [ %stack.8, %op1.bb ], [ null, %op2.bb ], [ %stack.next, %op4.bb ], [ %stack.next.2, %op5.bb ], [ %stack.4, %op6.bb ], [ null, %loop.header ]
%next.instr = phi ptr [ null, %entry ], [ %next.instr, %op1.bb ], [ null, %op2.bb ], [ %next.instr.20, %op4.bb ], [ %next.instr.21, %op5.bb ], [ %next.instr, %op6.bb ], [ null, %loop.header ]
%iv.next = add i64 %iv, 1
%next_op = getelementptr [6 x ptr], ptr @opcode.targets, i64 0, i64 %iv
indirectbr ptr %next_op, [label %op1.bb, label %op6.bb, label %loop.header, label %op2.bb, label %op4.bb, label %op5.bb]

; Same instruction sequence as %op6.bb apart from the leading store of null to
; %dst; the expected assembly emits only that store for this block and then
; falls through into the code of %op6.bb.
op1.bb:
store ptr null, ptr %dst, align 8
%stack.8 = getelementptr i8, ptr %stack.pointer, i64 -8
%l.0 = load ptr, ptr %stack.8, align 8
store i64 1, ptr %l.0, align 8
%gep.0 = getelementptr i8, ptr %l.0, i64 8
%l.1 = load ptr, ptr %gep.0, align 8
%gep.1 = getelementptr i8, ptr %l.1, i64 48
%l.2 = load ptr, ptr %gep.1, align 8
tail call void %l.2(ptr nonnull %l.0)
br label %loop.header

; Minimal handler: records %next.instr in %dst and returns to the dispatch.
op2.bb:
store ptr %next.instr, ptr %dst, align 8
br label %loop.header

; Same instruction sequence as %op5.bb; in the expected assembly the label for
; this block has no code of its own and sits directly before %op5.bb's label.
op4.bb:
store ptr %next.instr, ptr %dst, align 8
%next.instr.20 = getelementptr i8, ptr %next.instr, i64 20
%stack.2 = getelementptr i8, ptr %stack.pointer, i64 -8
%l.3 = load ptr, ptr %stack.2, align 8
%next.instr.12 = getelementptr i8, ptr %next.instr, i64 12
%next.instr.12.val = load ptr, ptr %next.instr.12, align 2
store ptr %next.instr.12.val, ptr %stack.2, align 8
store ptr %l.3, ptr %stack.pointer, align 8
%stack.next = getelementptr i8, ptr %stack.pointer, i64 8
br label %loop.header

; Stores %next.instr to %dst, advances it by 20 bytes, swaps the stack slot at
; offset -8 with the value loaded from %next.instr + 12, and bumps the stack
; pointer by 8 before returning to the dispatch.
op5.bb:
store ptr %next.instr, ptr %dst, align 8
%next.instr.21 = getelementptr i8, ptr %next.instr, i64 20
%stack.3 = getelementptr i8, ptr %stack.pointer, i64 -8
%l.4 = load ptr, ptr %stack.3, align 8
%next.instr.2 = getelementptr i8, ptr %next.instr, i64 12
%next.instr.2.val = load ptr, ptr %next.instr.2, align 2
store ptr %next.instr.2.val, ptr %stack.3, align 8
store ptr %l.4, ptr %stack.pointer, align 8
%stack.next.2 = getelementptr i8, ptr %stack.pointer, i64 8
br label %loop.header

; Pops a pointer from the stack slot at offset -8, stores 1 through it, and
; makes an indirect tail call to the function pointer found at offset 48 of the
; pointer stored at offset 8 of the popped object.
op6.bb:
%stack.4 = getelementptr i8, ptr %stack.pointer, i64 -8
%l.5 = load ptr, ptr %stack.4, align 8
store i64 1, ptr %l.5, align 8
%gep.5 = getelementptr i8, ptr %l.5, i64 8
%l.6 = load ptr, ptr %gep.5, align 8
%gep.6 = getelementptr i8, ptr %l.6, i64 48
%l.7 = load ptr, ptr %gep.6, align 8
tail call void %l.7(ptr nonnull %l.5)
br label %loop.header
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
# Check that only the computed goto is not restricted by tail-dup-pred-size and tail-dup-succ-size.
#
# Check that computed gotos, like all other blocks, are restricted by tail-dup-pred-size and tail-dup-succ-size.
#
--- |
@computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
declare i64 @f0()
Expand Down Expand Up @@ -30,54 +32,54 @@ tracksRegLiveness: true
body: |
; CHECK-LABEL: name: computed_goto
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rax
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY]], @computed_goto.dispatch, $noreg
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rax
; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY $rax
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rax
; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64_nosp = COPY $rax
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY4]], @computed_goto.dispatch, $noreg
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rax
; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY $rax
; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY6]], @computed_goto.dispatch, $noreg
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY $rax
; CHECK-NEXT: JMP_1 %bb.5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
; CHECK-NEXT: successors: %bb.5(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64_nosp = COPY $rax
; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = COPY [[COPY8]]
; CHECK-NEXT: JMP64m $noreg, 8, [[COPY8]], @computed_goto.dispatch, $noreg
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY $rax
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5:
; CHECK-NEXT: successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:gr64_nosp = PHI [[COPY]], %bb.0, [[COPY4]], %bb.4, [[COPY3]], %bb.3, [[COPY2]], %bb.2, [[COPY1]], %bb.1
; CHECK-NEXT: JMP64m $noreg, 8, [[PHI]], @computed_goto.dispatch, $noreg
bb.0:
ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
Expand Down
Loading
Loading