[InlineCost] Improve the cost heuristic for Switch
Summary:
The motivation example is like below which has 13 cases but only 2 distinct targets
```
lor.lhs.false2: ; preds = %if.then
switch i32 %Status, label %if.then27 [
i32 -7012, label %if.end35
i32 -10008, label %if.end35
i32 -10016, label %if.end35
i32 15000, label %if.end35
i32 14013, label %if.end35
i32 10114, label %if.end35
i32 10107, label %if.end35
i32 10105, label %if.end35
i32 10013, label %if.end35
i32 10011, label %if.end35
i32 7008, label %if.end35
i32 7007, label %if.end35
i32 5002, label %if.end35
]
```
which is compiled into a balanced binary tree like this on AArch64 (similar on X86)
```
.LBB853_9: // %lor.lhs.false2
mov w8, #10012
cmp w19, w8
b.gt .LBB853_14
// BB#10: // %lor.lhs.false2
mov w8, #5001
cmp w19, w8
b.gt .LBB853_18
// BB#11: // %lor.lhs.false2
mov w8, #-10016
cmp w19, w8
b.eq .LBB853_23
// BB#12: // %lor.lhs.false2
mov w8, #-10008
cmp w19, w8
b.eq .LBB853_23
// BB#13: // %lor.lhs.false2
mov w8, #-7012
cmp w19, w8
b.eq .LBB853_23
b .LBB853_3
.LBB853_14: // %lor.lhs.false2
mov w8, #14012
cmp w19, w8
b.gt .LBB853_21
// BB#15: // %lor.lhs.false2
mov w8, #-10105
add w8, w19, w8
cmp w8, #9 // =9
b.hi .LBB853_17
// BB#16: // %lor.lhs.false2
orr w9, wzr, #0x1
lsl w8, w9, w8
mov w9, #517
and w8, w8, w9
cbnz w8, .LBB853_23
.LBB853_17: // %lor.lhs.false2
mov w8, #10013
cmp w19, w8
b.eq .LBB853_23
b .LBB853_3
.LBB853_18: // %lor.lhs.false2
mov w8, #-7007
add w8, w19, w8
cmp w8, #2 // =2
b.lo .LBB853_23
// BB#19: // %lor.lhs.false2
mov w8, #5002
cmp w19, w8
b.eq .LBB853_23
// BB#20: // %lor.lhs.false2
mov w8, #10011
cmp w19, w8
b.eq .LBB853_23
b .LBB853_3
.LBB853_21: // %lor.lhs.false2
mov w8, #14013
cmp w19, w8
b.eq .LBB853_23
// BB#22: // %lor.lhs.false2
mov w8, #15000
cmp w19, w8
b.ne .LBB853_3
```
However, the inline cost model estimates the cost to be linear with the number
of distinct targets and the cost of the above switch is just 2 InstrCosts.
The function containing this switch is then inlined about 900 times.
This change use the general way of switch lowering for the inline heuristic. It
etimate the number of case clusters with the suitability check for a jump table
or bit test. Considering the binary search tree built for the clusters, this
change modifies the model to be linear with the size of the balanced binary
tree. The model is off by default for now :
-inline-generic-switch-cost=false
This change was originally proposed by Haicheng in D29870.
Reviewers: hans, bmakam, chandlerc, eraman, haicheng, mcrosier
Reviewed By: hans
Subscribers: joerg, aemerson, llvm-commits, rengolin
Differential Revision: https://reviews.llvm.org/D31085
llvm-svn: 301649
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 788f908..019051b 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -54,6 +54,11 @@
cl::init(45),
cl::desc("Threshold for inlining cold callsites"));
+static cl::opt<bool>
+ EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable generic switch cost model"));
+
// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
@@ -998,11 +1003,72 @@
if (isa<ConstantInt>(V))
return true;
- // Otherwise, we need to accumulate a cost proportional to the number of
- // distinct successor blocks. This fan-out in the CFG cannot be represented
- // for free even if we can represent the core switch as a jumptable that
- // takes a single instruction.
- //
+ if (EnableGenericSwitchCost) {
+ // Assume the most general case where the swith is lowered into
+ // either a jump table, bit test, or a balanced binary tree consisting of
+ // case clusters without merging adjacent clusters with the same
+ // destination. We do not consider the switches that are lowered with a mix
+ // of jump table/bit test/binary search tree. The cost of the switch is
+ // proportional to the size of the tree or the size of jump table range.
+
+ // Exit early for a large switch, assuming one case needs at least one
+ // instruction.
+ // FIXME: This is not true for a bit test, but ignore such case for now to
+ // save compile-time.
+ int64_t CostLowerBound =
+ std::min((int64_t)INT_MAX,
+ (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
+
+ if (CostLowerBound > Threshold) {
+ Cost = CostLowerBound;
+ return false;
+ }
+
+ unsigned JumpTableSize = 0;
+ unsigned NumCaseCluster =
+ TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+
+ // If suitable for a jump table, consider the cost for the table size and
+ // branch to destination.
+ if (JumpTableSize) {
+ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+ 4 * InlineConstants::InstrCost;
+ Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
+ return false;
+ }
+
+ // Considering forming a binary search, we should find the number of nodes
+ // which is same as the number of comparisons when lowered. For a given
+ // number of clusters, n, we can define a recursive function, f(n), to find
+ // the number of nodes in the tree. The recursion is :
+ // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
+ // and f(n) = n, when n <= 3.
+ // This will lead a binary tree where the leaf should be either f(2) or f(3)
+ // when n > 3. So, the number of comparisons from leaves should be n, while
+ // the number of non-leaf should be :
+ // 2^(log2(n) - 1) - 1
+ // = 2^log2(n) * 2^-1 - 1
+ // = n / 2 - 1.
+ // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+ // number of comparisons in a simple closed form :
+ // n + n / 2 - 1 = n * 3 / 2 - 1
+ if (NumCaseCluster <= 3) {
+ // Suppose a comparison includes one compare and one conditional branch.
+ Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+ return false;
+ }
+ int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
+ uint64_t SwitchCost =
+ ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+ Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
+ return false;
+ }
+
+ // Use a simple switch cost model where we accumulate a cost proportional to
+ // the number of distinct successor blocks. This fan-out in the CFG cannot
+ // be represented for free even if we can represent the core switch as a
+ // jumptable that takes a single instruction.
+ ///
// NB: We convert large switches which are just used to initialize large phi
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
// inlining those. It will prevent inlining in cases where the optimization