[HotColdSplit] Introduce a cost model to control splitting behavior
The main goal of the model is to avoid *increasing* function size, as
that would eradicate any memory locality benefits from splitting.
Splitting increases function size when:
- There are too many inputs or outputs to the cold region. Argument
materialization and reloads of outputs have a cost.
- The cold region has too many distinct exit blocks, causing a large
switch to be formed in the caller.
- The code size cost of the split code is less than the cost of a
set-up call.
A secondary goal is to prevent excessive overall binary size growth.
With the cost model in place, I experimented to find a splitting
threshold that works well in practice. To make warm & cold code easily
separable for analysis purposes, I moved split functions to a "cold"
section. I experimented with thresholds in the range [0, 4] and set the
default to the threshold which minimized geomean __text size.
Experiment data from building LNT+externals for X86 (N = 639 programs,
all sizes in bytes):
| Configuration | __text geom size | __cold geom size | TEXT geom size |
| **-Os** | 1736.3 | 0, n=0 | 10961.6 |
| -Os, thresh=0 | 1740.53 | 124.482, n=134 | 11014 |
| -Os, thresh=1 | 1734.79 | 57.8781, n=90 | 10978.6 |
| -Os, thresh=2 | ** 1733.85 ** | 65.6604, n=61 | 10977.6 |
| -Os, thresh=3 | 1733.85 | 65.3071, n=61 | 10977.6 |
| -Os, thresh=4 | 1735.08 | 67.5156, n=54 | 10965.7 |
| **-Oz** | 1554.4 | 0, n=0 | 10153 |
| -Oz, thresh=2 | ** 1552.2 ** | 65.633, n=61 | 10176 |
| **-O3** | 2563.37 | 0, n=0 | 13105.4 |
| -O3, thresh=2 | ** 2559.49 ** | 71.1072, n=61 | 13162.4 |
Picking thresh=2 reduces the geomean __text section size by ~0.14-0.15%
at -Os, -Oz, and -O3, and grows the TEXT segment by ~0.15% (-Os), ~0.23%
(-Oz), and ~0.43% (-O3). Note that TEXT size is page-aligned, whereas
section sizes are byte-aligned.
Experiment data from building LNT+externals for ARM64 (N = 558 programs,
all sizes in bytes):
| Configuration | __text geom size | __cold geom size | TEXT geom size |
| **-Os** | 1763.96 | 0, n=0 | 42934.9 |
| -Os, thresh=2 | ** 1760.9 ** | 76.6755, n=61 | 42934.9 |
Picking thresh=2 reduces the geomean __text section size by 0.17% at
-Os and causes no growth in the TEXT segment.
Measurements were done with D57082 (r352080) applied.
Differential Revision: https://reviews.llvm.org/D57125
llvm-svn: 352228
diff --git a/llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll b/llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll
deleted file mode 100644
index 9a751e3..0000000
--- a/llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=1 < %s | FileCheck %s
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.14.0"
-
-; CHECK-LABEL: define {{.*}}@foo(
-; CHECK: call {{.*}}@foo.cold.1(
-; CHECK: unreachable
-
-; CHECK-LABEL: define {{.*}}@foo.cold.1(
-; CHECK: switch i32 undef, label %sw.epilog.i
-define void @foo(i32 %QMM) {
-entry:
- switch i32 %QMM, label %entry.if.end16_crit_edge [
- i32 1, label %if.then
- ]
-
-entry.if.end16_crit_edge: ; preds = %entry
- br label %if.end16
-
-if.then: ; preds = %entry
- br i1 undef, label %cond.true.i.i, label %_ZN10StringView8popFrontEv.exit.i
-
-cond.true.i.i: ; preds = %if.then
- ret void
-
-_ZN10StringView8popFrontEv.exit.i: ; preds = %if.then
- switch i32 undef, label %sw.epilog.i [
- i32 81, label %if.end16
- i32 82, label %sw.bb4.i
- i32 83, label %sw.bb8.i
- i32 84, label %sw.bb12.i
- i32 65, label %if.end16
- i32 66, label %sw.bb20.i
- i32 67, label %sw.bb24.i
- i32 68, label %sw.bb28.i
- ]
-
-sw.bb4.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-sw.bb8.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-sw.bb12.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-sw.bb20.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-sw.bb24.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-sw.bb28.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-sw.epilog.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
- br label %if.end16
-
-if.end16: ; preds = %sw.epilog.i, %sw.bb28.i, %sw.bb24.i, %sw.bb20.i, %sw.bb12.i, %sw.bb8.i, %sw.bb4.i, %_ZN10StringView8popFrontEv.exit.i, %_ZN10StringView8popFrontEv.exit.i, %entry.if.end16_crit_edge
- %0 = phi i8 [ 0, %entry.if.end16_crit_edge ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 1, %sw.bb4.i ], [ 2, %sw.bb8.i ], [ 3, %sw.bb12.i ], [ 1, %sw.bb20.i ], [ 2, %sw.bb24.i ], [ 3, %sw.bb28.i ], [ 0, %sw.epilog.i ]
- unreachable
-}
diff --git a/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll b/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll
deleted file mode 100644
index 3f04283..0000000
--- a/llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; The magic number 6 comes from (1 * TCC_Expensive) + (1 * CostOfCallX86).
-; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=6 -S < %s | FileCheck %s
-
-; Test that we outline even though there are only two cold instructions. TTI
-; should determine that they are expensive in terms of code size.
-
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.14.0"
-
-; CHECK-LABEL: @fun
-; CHECK: call void @fun.cold.1
-define void @fun(i32 %x) {
-entry:
- br i1 undef, label %if.then, label %if.else
-
-if.then:
- ret void
-
-if.else:
- %y = sdiv i32 %x, 111
- call void @sink(i32 %y)
- ret void
-}
-
-declare void @sink(i32 %x) cold
diff --git a/llvm/test/Transforms/HotColdSplit/addr-taken.ll b/llvm/test/Transforms/HotColdSplit/addr-taken.ll
index f2f448c..19f1d4f 100644
--- a/llvm/test/Transforms/HotColdSplit/addr-taken.ll
+++ b/llvm/test/Transforms/HotColdSplit/addr-taken.ll
@@ -1,4 +1,4 @@
-; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -S < %s | FileCheck %s
+; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
diff --git a/llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll b/llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll
new file mode 100644
index 0000000..c1d9af8
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/apply-noreturn-bonus.ll
@@ -0,0 +1,26 @@
+; REQUIRES: asserts
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @sink() cold
+
+define void @foo(i32 %arg) {
+entry:
+ br i1 undef, label %cold1, label %exit
+
+cold1:
+ ; CHECK: Applying bonus for: 4 non-returning terminators
+ call void @sink()
+ br i1 undef, label %cold2, label %cold3
+
+cold2:
+ br label %cold4
+
+cold3:
+ br label %cold4
+
+cold4:
+ unreachable
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
new file mode 100644
index 0000000..fffd6f9
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-inputs.ll
@@ -0,0 +1,19 @@
+; REQUIRES: asserts
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @sink(i32*, i32, i32) cold
+
+@g = global i32 0
+
+define void @foo(i32 %arg) {
+ %local = load i32, i32* @g
+ br i1 undef, label %cold, label %exit
+
+cold:
+ ; CHECK: Applying penalty for: 2 inputs
+ call void @sink(i32* @g, i32 %arg, i32 %local)
+ ret void
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
new file mode 100644
index 0000000..a7d9f97
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/apply-penalty-for-outputs.ll
@@ -0,0 +1,22 @@
+; REQUIRES: asserts
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @sink() cold
+
+@g = global i32 0
+
+define i32 @foo(i32 %arg) {
+entry:
+ br i1 undef, label %cold, label %exit
+
+cold:
+ ; CHECK: Applying penalty for: 1 output
+ ; CHECK: Applying penalty for: 1 non-region successors
+ %local = load i32, i32* @g
+ call void @sink()
+ br label %exit
+
+exit:
+ %p = phi i32 [ %local, %cold ], [ 0, %entry ]
+ ret i32 %p
+}
diff --git a/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
new file mode 100644
index 0000000..3886d76
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/apply-successor-penalty.ll
@@ -0,0 +1,53 @@
+; REQUIRES: asserts
+; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @sink() cold
+
+; CHECK-LABEL: Outlining in one_non_region_successor
+define void @one_non_region_successor(i32 %arg) {
+entry:
+ br i1 undef, label %cold1, label %exit
+
+cold1:
+ ; CHECK: Applying penalty for: 1 non-region successor
+ call void @sink()
+ br i1 undef, label %cold2, label %cold3
+
+cold2:
+ br i1 undef, label %cold4, label %exit
+
+cold3:
+ br i1 undef, label %cold4, label %exit
+
+cold4:
+ unreachable
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: Outlining in two_non_region_successor
+define void @two_non_region_successors(i32 %arg) {
+entry:
+ br i1 undef, label %cold1, label %exit1
+
+cold1:
+ ; CHECK: Applying penalty for: 2 non-region successors
+ call void @sink()
+ br i1 undef, label %cold2, label %cold3
+
+cold2:
+ br i1 undef, label %cold4, label %exit1
+
+cold3:
+ br i1 undef, label %cold4, label %exit2
+
+cold4:
+ unreachable
+
+exit1:
+ br label %exit2
+
+exit2:
+ ret void
+}
diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
index 64bc94e..b33454b 100644
--- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
+++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=0 < %s 2>&1 | FileCheck %s
+; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=-1 < %s 2>&1 | FileCheck %s
; CHECK-LABEL: define {{.*}}@fun
; CHECK: call {{.*}}@fun.cold.2(
diff --git a/llvm/test/Transforms/HotColdSplit/resume.ll b/llvm/test/Transforms/HotColdSplit/resume.ll
index cbda078..2b8ea7d 100644
--- a/llvm/test/Transforms/HotColdSplit/resume.ll
+++ b/llvm/test/Transforms/HotColdSplit/resume.ll
@@ -1,4 +1,4 @@
-; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -S < %s | FileCheck %s
+; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"
diff --git a/llvm/test/Transforms/HotColdSplit/split-cold-2.ll b/llvm/test/Transforms/HotColdSplit/split-cold-2.ll
index 0ce1681..0b228a5 100644
--- a/llvm/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/llvm/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
-; RUN: opt -hotcoldsplit-threshold=0 -passes=hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
+; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
; Make sure this compiles. This test used to fail with an invalid phi node: the
; two predecessors were outlined and the SSA representation was invalid.