Skip to content

Commit 1d999b8

Browse files
committed
[AArch64] Optimize CBZ wzr and friends.
In certain situations, especially with zero phi operands propagated after tail duplications, we can end up with CBZ/CBNZ/TBZ/TBNZ with a zero register. These can be introduced late in the pipeline. This patch adds a basic pass to fold them away, either by converting them to a direct branch or by removing the instruction entirely. It runs quite late in the pipeline, so it doesn't fit into any of the existing passes. It only needs to look at the terminators of each BB, so the new pass should have a limited compile-time impact.
1 parent 879dddf commit 1d999b8

File tree

14 files changed

+159
-62
lines changed

14 files changed

+159
-62
lines changed

llvm/lib/Target/AArch64/AArch64.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class ModulePass;
3333

3434
FunctionPass *createAArch64DeadRegisterDefinitions();
3535
FunctionPass *createAArch64RedundantCopyEliminationPass();
36+
FunctionPass *createAArch64RedundantCondBranchPass();
3637
FunctionPass *createAArch64CondBrTuning();
3738
FunctionPass *createAArch64CompressJumpTablesPass();
3839
FunctionPass *createAArch64ConditionalCompares();
@@ -103,6 +104,7 @@ void initializeAArch64PostSelectOptimizePass(PassRegistry &);
103104
void initializeAArch64PreLegalizerCombinerPass(PassRegistry &);
104105
void initializeAArch64PromoteConstantPass(PassRegistry&);
105106
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
107+
void initializeAArch64RedundantCondBranchPass(PassRegistry &);
106108
void initializeAArch64SIMDInstrOptPass(PassRegistry &);
107109
void initializeAArch64SLSHardeningPass(PassRegistry &);
108110
void initializeAArch64SpeculationHardeningPass(PassRegistry &);
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
//=- AArch64RedundantCondBranchPass.cpp - Remove redundant cbz wzr ----------=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Late in the pipeline, especially with zero phi operands propagated after tail
10+
// duplications, we can end up with CBZ/CBNZ/TBZ/TBNZ with a zero register. This
11+
// simple pass looks at the terminators to a block, removing the redundant
12+
// instructions where necessary.
13+
//
14+
//===----------------------------------------------------------------------===//
15+
16+
#include "AArch64.h"
17+
#include "llvm/CodeGen/MachineFunctionPass.h"
18+
#include "llvm/CodeGen/MachineInstrBuilder.h"
19+
#include "llvm/CodeGen/TargetInstrInfo.h"
20+
#include "llvm/Support/Debug.h"
21+
22+
using namespace llvm;
23+
24+
#define DEBUG_TYPE "aarch64-redundantcondbranch"
25+
26+
namespace {
27+
class AArch64RedundantCondBranch : public MachineFunctionPass {
28+
public:
29+
static char ID;
30+
AArch64RedundantCondBranch() : MachineFunctionPass(ID) {}
31+
32+
bool runOnMachineFunction(MachineFunction &MF) override;
33+
34+
MachineFunctionProperties getRequiredProperties() const override {
35+
return MachineFunctionProperties().setNoVRegs();
36+
}
37+
StringRef getPassName() const override {
38+
return "AArch64 Redundant Conditional Branch Elimination";
39+
}
40+
};
41+
char AArch64RedundantCondBranch::ID = 0;
42+
} // namespace
43+
44+
INITIALIZE_PASS(AArch64RedundantCondBranch, "aarch64-redundantcondbranch",
45+
"AArch64 Redundant Conditional Branch Elimination pass", false,
46+
false)
47+
48+
/// Fold away a terminator of \p MBB that tests the zero register.
///
/// - CBZ/TBZ on WZR/XZR is always taken: every successor other than the branch
///   target is removed, an unconditional B to the target is appended to the
///   block, and the original branch plus everything that followed it is erased.
/// - CBNZ/TBNZ on WZR/XZR is never taken: the branch is erased and its target
///   is removed from the successor list.
///
/// At most one terminator is rewritten per call.
///
/// \returns true if the block was modified.
static bool optimizeTerminators(MachineBasicBlock *MBB) {
  for (MachineInstr &Term : make_early_inc_range(MBB->terminators())) {
    const unsigned Opc = Term.getOpcode();

    // Classify the terminator: is it a bit-test form (target is operand 2
    // rather than operand 1), and would it be taken when the tested value is 0?
    bool IsBitTest = false;
    bool AlwaysTaken = false;
    switch (Opc) {
    case AArch64::TBZW:
    case AArch64::TBZX:
      IsBitTest = true;
      [[fallthrough]];
    case AArch64::CBZW:
    case AArch64::CBZX:
      AlwaysTaken = true;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      IsBitTest = true;
      [[fallthrough]];
    case AArch64::CBNZW:
    case AArch64::CBNZX:
      break;
    default:
      continue;
    }

    // Only branches on the zero register have a statically known outcome.
    const auto TestedReg = Term.getOperand(0).getReg();
    if (!(TestedReg == AArch64::WZR || TestedReg == AArch64::XZR))
      continue;

    LLVM_DEBUG(dbgs() << "Removing redundant branch: " << Term);
    MachineBasicBlock *Dest = Term.getOperand(IsBitTest ? 2 : 1).getMBB();

    if (!AlwaysTaken) {
      // CBNZ XZR -> nop
      // NOTE(review): Dest is dropped from the successor list unconditionally;
      // confirm no other terminator or fall-through of this block can still
      // reach Dest.
      MBB->removeSuccessor(Dest);
      Term.eraseFromParent();
      return true;
    }

    // CBZ XZR -> B
    // Iterate over a copy of the successor list because it is mutated below.
    SmallVector<MachineBasicBlock *> OldSuccs(MBB->successors());
    for (MachineBasicBlock *Succ : OldSuccs)
      if (Succ != Dest)
        MBB->removeSuccessor(Succ);

    // Record the branch and everything after it before appending the new B,
    // so that the new instruction is not swept up in the erase.
    SmallVector<MachineInstr *> Doomed;
    for (auto I = Term.getIterator(); I != MBB->end(); ++I)
      Doomed.push_back(&*I);

    const TargetInstrInfo *TII =
        MBB->getParent()->getSubtarget().getInstrInfo();
    BuildMI(MBB, Term.getDebugLoc(), TII->get(AArch64::B)).addMBB(Dest);

    for (MachineInstr *Dead : Doomed)
      Dead->eraseFromParent();
    return true;
  }
  return false;
}
100+
101+
/// Run optimizeTerminators over every basic block of \p MF, unless
/// skipFunction() says the function should be left alone.
///
/// \returns true if any block was modified.
bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
  bool Modified = false;
  if (!skipFunction(MF.getFunction())) {
    for (MachineBasicBlock &Block : MF)
      if (optimizeTerminators(&Block))
        Modified = true;
  }
  return Modified;
}
110+
111+
/// Factory for the redundant conditional branch elimination pass; returns a
/// newly allocated pass instance.
FunctionPass *llvm::createAArch64RedundantCondBranchPass() {
  FunctionPass *Pass = new AArch64RedundantCondBranch();
  return Pass;
}

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ LLVMInitializeAArch64Target() {
260260
initializeAArch64PostSelectOptimizePass(PR);
261261
initializeAArch64PromoteConstantPass(PR);
262262
initializeAArch64RedundantCopyEliminationPass(PR);
263+
initializeAArch64RedundantCondBranchPass(PR);
263264
initializeAArch64StorePairSuppressPass(PR);
264265
initializeFalkorHWPFFixPass(PR);
265266
initializeFalkorMarkStridedAccessesLegacyPass(PR);
@@ -862,6 +863,8 @@ void AArch64PassConfig::addPreEmitPass() {
862863
if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive &&
863864
EnableAArch64CopyPropagation)
864865
addPass(createMachineCopyPropagationPass(true));
866+
if (TM->getOptLevel() != CodeGenOptLevel::None)
867+
addPass(createAArch64RedundantCondBranchPass());
865868

866869
addPass(createAArch64A53Fix835769());
867870

llvm/lib/Target/AArch64/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ add_llvm_target(AArch64CodeGen
6161
AArch64CompressJumpTables.cpp
6262
AArch64ConditionOptimizer.cpp
6363
AArch64RedundantCopyElimination.cpp
64+
AArch64RedundantCondBranchPass.cpp
6465
AArch64ISelDAGToDAG.cpp
6566
AArch64ISelLowering.cpp
6667
AArch64InstrInfo.cpp

llvm/test/CodeGen/AArch64/O3-pipeline.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@
227227
; CHECK-NEXT: Implement the 'patchable-function' attribute
228228
; CHECK-NEXT: AArch64 load / store optimization pass
229229
; CHECK-NEXT: Machine Copy Propagation Pass
230+
; CHECK-NEXT: AArch64 Redundant Conditional Branch Elimination
230231
; CHECK-NEXT: Workaround A53 erratum 835769 pass
231232
; CHECK-NEXT: Contiguously Lay Out Funclets
232233
; CHECK-NEXT: Remove Loads Into Fake Uses

llvm/test/CodeGen/AArch64/arm64-rev.ll

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -530,28 +530,22 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
530530
define void @test_rev16_truncstore() {
531531
; CHECK-SD-LABEL: test_rev16_truncstore:
532532
; CHECK-SD: // %bb.0: // %entry
533-
; CHECK-SD-NEXT: cbnz wzr, .LBB38_2
534533
; CHECK-SD-NEXT: .LBB38_1: // %cleanup
535534
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
536535
; CHECK-SD-NEXT: ldrh w8, [x8]
537536
; CHECK-SD-NEXT: rev16 w8, w8
538537
; CHECK-SD-NEXT: strh w8, [x8]
539-
; CHECK-SD-NEXT: cbz wzr, .LBB38_1
540-
; CHECK-SD-NEXT: .LBB38_2: // %fail
541-
; CHECK-SD-NEXT: ret
538+
; CHECK-SD-NEXT: b .LBB38_1
542539
;
543540
; CHECK-GI-LABEL: test_rev16_truncstore:
544541
; CHECK-GI: // %bb.0: // %entry
545-
; CHECK-GI-NEXT: tbnz wzr, #0, .LBB38_2
546542
; CHECK-GI-NEXT: .LBB38_1: // %cleanup
547543
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
548544
; CHECK-GI-NEXT: ldrh w8, [x8]
549545
; CHECK-GI-NEXT: rev w8, w8
550546
; CHECK-GI-NEXT: lsr w8, w8, #16
551547
; CHECK-GI-NEXT: strh w8, [x8]
552-
; CHECK-GI-NEXT: tbz wzr, #0, .LBB38_1
553-
; CHECK-GI-NEXT: .LBB38_2: // %fail
554-
; CHECK-GI-NEXT: ret
548+
; CHECK-GI-NEXT: b .LBB38_1
555549
entry:
556550
br label %body
557551

llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,6 @@ define void @infiniteloop() {
735735
; ENABLE-NEXT: .cfi_offset w29, -16
736736
; ENABLE-NEXT: .cfi_offset w19, -24
737737
; ENABLE-NEXT: .cfi_offset w20, -32
738-
; ENABLE-NEXT: cbnz wzr, LBB10_3
739738
; ENABLE-NEXT: ; %bb.1: ; %if.then
740739
; ENABLE-NEXT: sub x19, sp, #16
741740
; ENABLE-NEXT: mov sp, x19
@@ -746,7 +745,7 @@ define void @infiniteloop() {
746745
; ENABLE-NEXT: add w20, w0, w20
747746
; ENABLE-NEXT: str w20, [x19]
748747
; ENABLE-NEXT: b LBB10_2
749-
; ENABLE-NEXT: LBB10_3: ; %if.end
748+
; ENABLE-NEXT: ; %bb.3: ; %if.end
750749
; ENABLE-NEXT: sub sp, x29, #16
751750
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
752751
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
@@ -762,7 +761,6 @@ define void @infiniteloop() {
762761
; DISABLE-NEXT: .cfi_offset w29, -16
763762
; DISABLE-NEXT: .cfi_offset w19, -24
764763
; DISABLE-NEXT: .cfi_offset w20, -32
765-
; DISABLE-NEXT: cbnz wzr, LBB10_3
766764
; DISABLE-NEXT: ; %bb.1: ; %if.then
767765
; DISABLE-NEXT: sub x19, sp, #16
768766
; DISABLE-NEXT: mov sp, x19
@@ -773,7 +771,7 @@ define void @infiniteloop() {
773771
; DISABLE-NEXT: add w20, w0, w20
774772
; DISABLE-NEXT: str w20, [x19]
775773
; DISABLE-NEXT: b LBB10_2
776-
; DISABLE-NEXT: LBB10_3: ; %if.end
774+
; DISABLE-NEXT: ; %bb.3: ; %if.end
777775
; DISABLE-NEXT: sub sp, x29, #16
778776
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
779777
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
@@ -808,7 +806,6 @@ define void @infiniteloop2() {
808806
; ENABLE-NEXT: .cfi_offset w29, -16
809807
; ENABLE-NEXT: .cfi_offset w19, -24
810808
; ENABLE-NEXT: .cfi_offset w20, -32
811-
; ENABLE-NEXT: cbnz wzr, LBB11_3
812809
; ENABLE-NEXT: ; %bb.1: ; %if.then
813810
; ENABLE-NEXT: sub x8, sp, #16
814811
; ENABLE-NEXT: mov sp, x8
@@ -825,7 +822,7 @@ define void @infiniteloop2() {
825822
; ENABLE-NEXT: nop
826823
; ENABLE-NEXT: ; InlineAsm End
827824
; ENABLE-NEXT: b LBB11_2
828-
; ENABLE-NEXT: LBB11_3: ; %if.end
825+
; ENABLE-NEXT: ; %bb.3: ; %if.end
829826
; ENABLE-NEXT: sub sp, x29, #16
830827
; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
831828
; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
@@ -841,7 +838,6 @@ define void @infiniteloop2() {
841838
; DISABLE-NEXT: .cfi_offset w29, -16
842839
; DISABLE-NEXT: .cfi_offset w19, -24
843840
; DISABLE-NEXT: .cfi_offset w20, -32
844-
; DISABLE-NEXT: cbnz wzr, LBB11_3
845841
; DISABLE-NEXT: ; %bb.1: ; %if.then
846842
; DISABLE-NEXT: sub x8, sp, #16
847843
; DISABLE-NEXT: mov sp, x8
@@ -858,7 +854,7 @@ define void @infiniteloop2() {
858854
; DISABLE-NEXT: nop
859855
; DISABLE-NEXT: ; InlineAsm End
860856
; DISABLE-NEXT: b LBB11_2
861-
; DISABLE-NEXT: LBB11_3: ; %if.end
857+
; DISABLE-NEXT: ; %bb.3: ; %if.end
862858
; DISABLE-NEXT: sub sp, x29, #16
863859
; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
864860
; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
@@ -893,7 +889,6 @@ if.end:
893889
define void @infiniteloop3() {
894890
; ENABLE-LABEL: infiniteloop3:
895891
; ENABLE: ; %bb.0: ; %entry
896-
; ENABLE-NEXT: cbnz wzr, LBB12_5
897892
; ENABLE-NEXT: ; %bb.1: ; %loop2a.preheader
898893
; ENABLE-NEXT: mov x8, xzr
899894
; ENABLE-NEXT: mov x9, xzr
@@ -912,12 +907,11 @@ define void @infiniteloop3() {
912907
; ENABLE-NEXT: mov x8, x10
913908
; ENABLE-NEXT: mov x11, x10
914909
; ENABLE-NEXT: b LBB12_3
915-
; ENABLE-NEXT: LBB12_5: ; %end
910+
; ENABLE-NEXT: ; %bb.5: ; %end
916911
; ENABLE-NEXT: ret
917912
;
918913
; DISABLE-LABEL: infiniteloop3:
919914
; DISABLE: ; %bb.0: ; %entry
920-
; DISABLE-NEXT: cbnz wzr, LBB12_5
921915
; DISABLE-NEXT: ; %bb.1: ; %loop2a.preheader
922916
; DISABLE-NEXT: mov x8, xzr
923917
; DISABLE-NEXT: mov x9, xzr
@@ -936,7 +930,7 @@ define void @infiniteloop3() {
936930
; DISABLE-NEXT: mov x8, x10
937931
; DISABLE-NEXT: mov x11, x10
938932
; DISABLE-NEXT: b LBB12_3
939-
; DISABLE-NEXT: LBB12_5: ; %end
933+
; DISABLE-NEXT: ; %bb.5: ; %end
940934
; DISABLE-NEXT: ret
941935
entry:
942936
br i1 undef, label %loop2a, label %body

llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,20 @@
88
define i8 @foo_optsize(i32 %v4) optsize {
99
; CHECK-LABEL: foo_optsize:
1010
; CHECK: // %bb.0: // %entry
11-
; CHECK-NEXT: cbz wzr, .LBB0_2
11+
; CHECK-NEXT: b .LBB0_2
1212
; CHECK-NEXT: .LBB0_1:
1313
; CHECK-NEXT: mov w0, wzr
1414
; CHECK-NEXT: ret
1515
; CHECK-NEXT: .LBB0_2: // %b1
1616
; CHECK-NEXT: cbnz w0, .LBB0_4
17-
; CHECK-NEXT: .LBB0_3: // %b2
17+
; CHECK-NEXT: // %bb.3: // %b2
1818
; CHECK-NEXT: mov w0, #1 // =0x1
1919
; CHECK-NEXT: ret
2020
; CHECK-NEXT: .LBB0_4: // %b1
2121
; CHECK-NEXT: cmp w0, #1
2222
; CHECK-NEXT: b.ne .LBB0_1
2323
; CHECK-NEXT: // %bb.5: // %b3
24-
; CHECK-NEXT: cbz wzr, .LBB0_1
25-
; CHECK-NEXT: b .LBB0_3
24+
; CHECK-NEXT: b .LBB0_1
2625
entry:
2726
%v2 = icmp eq i32 0, 0
2827
br i1 %v2, label %b1, label %b4
@@ -48,20 +47,19 @@ b4:
4847
define i8 @foo_optspeed(i32 %v4) {
4948
; CHECK-LABEL: foo_optspeed:
5049
; CHECK: // %bb.0: // %entry
51-
; CHECK-NEXT: cbz wzr, .LBB1_2
50+
; CHECK-NEXT: b .LBB1_2
5251
; CHECK-NEXT: .LBB1_1:
5352
; CHECK-NEXT: mov w0, wzr
5453
; CHECK-NEXT: ret
5554
; CHECK-NEXT: .LBB1_2: // %b1
5655
; CHECK-NEXT: cbnz w0, .LBB1_4
57-
; CHECK-NEXT: .LBB1_3: // %b2
56+
; CHECK-NEXT: // %bb.3: // %b2
5857
; CHECK-NEXT: mov w0, #1 // =0x1
5958
; CHECK-NEXT: ret
6059
; CHECK-NEXT: .LBB1_4: // %b1
6160
; CHECK-NEXT: cmp w0, #1
6261
; CHECK-NEXT: b.ne .LBB1_1
6362
; CHECK-NEXT: // %bb.5: // %b3
64-
; CHECK-NEXT: cbnz wzr, .LBB1_3
6563
; CHECK-NEXT: b .LBB1_1
6664
entry:
6765
%v2 = icmp eq i32 0, 0

0 commit comments

Comments
 (0)