[HotColdSplitting] Outline more than once per function
Algorithm: Identify maximal cold regions and put them in a worklist. If
a candidate region overlaps with another, discard it. While the worklist
is full, remove a single-entry sub-region from the worklist and attempt
to outline it. By the non-overlap property, this should not invalidate
parts of the domtree pertaining to other outlining regions.
Testing: LNT results on X86 are clean. With test-suite + externals, llvm
outlines 134KB pre-patch, and 352KB post-patch (+ ~2.6x). The file
483.xalancbmk/src/Constants.cpp stands out as an extreme case where llvm
outlines over 100 times in some functions (mostly EH paths). There was
not a significant performance impact pre vs. post-patch.
Differential Revision: https://reviews.llvm.org/D53887
llvm-svn: 348639
diff --git a/llvm/test/Transforms/HotColdSplit/eh-pads.ll b/llvm/test/Transforms/HotColdSplit/eh-pads.ll
new file mode 100644
index 0000000..cf413b4
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/eh-pads.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: landingpad
+; CHECK: sideeffect(i32 2)
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: sideeffect(i32 0)
+; CHECK: sideeffect(i32 1)
+; CHECK: sink
+
+define void @foo(i32 %cond) personality i8 0 {
+entry:
+ invoke void @llvm.donothing() to label %normal unwind label %exception
+
+exception:
+ ; Note: EH pads are not candidates for region entry points.
+ %cleanup = landingpad i8 cleanup
+ br label %continue_exception
+
+continue_exception:
+ call void @sideeffect(i32 0)
+ call void @sideeffect(i32 1)
+ call void @sink()
+ ret void
+
+normal:
+ call void @sideeffect(i32 2)
+ ret void
+}
+
+declare void @sideeffect(i32)
+
+declare void @sink() cold
+
+declare void @llvm.donothing() nounwind readnone
diff --git a/llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll b/llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll
new file mode 100644
index 0000000..11256a4
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: call {{.*}}@foo.cold.1(
+; CHECK: unreachable
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: switch i32 undef, label %sw.epilog.i
+define void @foo(i32 %QMM) {
+entry:
+ switch i32 %QMM, label %entry.if.end16_crit_edge [
+ i32 1, label %if.then
+ ]
+
+entry.if.end16_crit_edge: ; preds = %entry
+ br label %if.end16
+
+if.then: ; preds = %entry
+ br i1 undef, label %cond.true.i.i, label %_ZN10StringView8popFrontEv.exit.i
+
+cond.true.i.i: ; preds = %if.then
+ ret void
+
+_ZN10StringView8popFrontEv.exit.i: ; preds = %if.then
+ switch i32 undef, label %sw.epilog.i [
+ i32 81, label %if.end16
+ i32 82, label %sw.bb4.i
+ i32 83, label %sw.bb8.i
+ i32 84, label %sw.bb12.i
+ i32 65, label %if.end16
+ i32 66, label %sw.bb20.i
+ i32 67, label %sw.bb24.i
+ i32 68, label %sw.bb28.i
+ ]
+
+sw.bb4.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+sw.bb8.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+sw.bb12.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+sw.bb20.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+sw.bb24.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+sw.bb28.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+sw.epilog.i: ; preds = %_ZN10StringView8popFrontEv.exit.i
+ br label %if.end16
+
+if.end16: ; preds = %sw.epilog.i, %sw.bb28.i, %sw.bb24.i, %sw.bb20.i, %sw.bb12.i, %sw.bb8.i, %sw.bb4.i, %_ZN10StringView8popFrontEv.exit.i, %_ZN10StringView8popFrontEv.exit.i, %entry.if.end16_crit_edge
+ %0 = phi i8 [ 0, %entry.if.end16_crit_edge ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 1, %sw.bb4.i ], [ 2, %sw.bb8.i ], [ 3, %sw.bb12.i ], [ 1, %sw.bb20.i ], [ 2, %sw.bb24.i ], [ 3, %sw.bb28.i ], [ 0, %sw.epilog.i ]
+ unreachable
+}
diff --git a/llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll b/llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll
new file mode 100644
index 0000000..81a8f42
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll
@@ -0,0 +1,29 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@fun
+; CHECK: call {{.*}}@fun.cold.1(
+define void @fun() {
+entry:
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ ; This will be marked by the inverse DFS on sink-predecesors.
+ br label %sink
+
+sink:
+ call void @sink()
+
+ ; Do not allow the forward-DFS on sink-successors to mark the block again.
+ br i1 undef, label %if.then, label %if.then.exit
+
+if.then.exit:
+ ret void
+
+if.else:
+ ret void
+}
+
+declare void @sink() cold
diff --git a/llvm/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll b/llvm/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll
new file mode 100644
index 0000000..d8bd55e
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern __attribute__((cold)) void sink();
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+; if (cond1) {
+; if (cond2) {
+; sideeffect(0);
+; } else {
+; sideeffect(1);
+; }
+; sink();
+; } else {
+; sideeffect(2);
+; }
+; sink();
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK: define {{.*}}@_Z3fooii{{.*}}#[[outlined_func_attr:[0-9]+]]
+; CHECK-NOT: _Z3fooii.cold
+; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
+define void @_Z3fooii(i32, i32) {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ store i32 %0, i32* %3, align 4
+ store i32 %1, i32* %4, align 4
+ %5 = load i32, i32* %3, align 4
+ %6 = icmp ne i32 %5, 0
+ br i1 %6, label %7, label %13
+
+; <label>:7: ; preds = %2
+ %8 = load i32, i32* %4, align 4
+ %9 = icmp ne i32 %8, 0
+ br i1 %9, label %10, label %11
+
+; <label>:10: ; preds = %7
+ call void @_Z10sideeffecti(i32 0)
+ br label %12
+
+; <label>:11: ; preds = %7
+ call void @_Z10sideeffecti(i32 1)
+ br label %12
+
+; <label>:12: ; preds = %11, %10
+ call void @_Z4sinkv() #3
+ br label %14
+
+; <label>:13: ; preds = %2
+ call void @_Z10sideeffecti(i32 2)
+ br label %14
+
+; <label>:14: ; preds = %13, %12
+ call void @_Z4sinkv() #3
+ ret void
+}
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinkv() cold
diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
new file mode 100644
index 0000000..d10240a
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -hotcoldsplit < %s 2>&1 | FileCheck %s
+
+; CHECK-LABEL: define {{.*}}@fun
+; CHECK: call {{.*}}@fun.cold.2(
+; CHECK-NEXT: ret void
+; CHECK: call {{.*}}@fun.cold.1(
+; CHECK-NEXT: ret void
+define void @fun() {
+entry:
+ br i1 undef, label %A.then, label %A.else
+
+A.else:
+ br label %A.then4
+
+A.then4:
+ br i1 undef, label %A.then5, label %A.end
+
+A.then5:
+ br label %A.cleanup
+
+A.end:
+ br label %A.cleanup
+
+A.cleanup:
+ %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+ unreachable
+
+A.then:
+ br i1 undef, label %B.then, label %B.else
+
+B.then:
+ ret void
+
+B.else:
+ br label %B.then4
+
+B.then4:
+ br i1 undef, label %B.then5, label %B.end
+
+B.then5:
+ br label %B.cleanup
+
+B.end:
+ br label %B.cleanup
+
+B.cleanup:
+ %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+ unreachable
+}
+
+; CHECK-LABEL: define {{.*}}@fun.cold.1(
+; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+; CHECK-NEXT: unreachable
+
+; CHECK-LABEL: define {{.*}}@fun.cold.2(
+; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+; CHECK-NEXT: unreachable
diff --git a/llvm/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll b/llvm/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll
new file mode 100644
index 0000000..8fd20c0
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern __attribute__((cold)) void sink();
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+; while (true) {
+; if (cond1) {
+; sideeffect(0); // This is cold (it reaches sink()).
+; break;
+; }
+; if (cond2) {
+; sideeffect(1); // This is cold (it reaches sink()).
+; break;
+; }
+; sideeffect(2);
+; return;
+; }
+; sink();
+; sideeffect(3);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.1
+; CHECK: call void @_Z10sideeffecti(i32 1)
+; CHECK: call void @_Z10sideeffecti(i32 11)
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.2
+; CHECK: call void @_Z10sideeffecti(i32 0)
+; CHECK: call void @_Z10sideeffecti(i32 10)
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.3
+; CHECK: call void @_Z4sinkv
+; CHECK: call void @_Z10sideeffecti(i32 3)
+
+define void @_Z3fooii(i32, i32) {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ store i32 %0, i32* %3, align 4
+ store i32 %1, i32* %4, align 4
+ br label %5
+
+; <label>:5: ; preds = %2
+ %6 = load i32, i32* %3, align 4
+ %7 = icmp ne i32 %6, 0
+ br i1 %7, label %8, label %9
+
+; <label>:8: ; preds = %5
+ call void @_Z10sideeffecti(i32 0)
+ call void @_Z10sideeffecti(i32 10)
+ br label %14
+
+; <label>:9: ; preds = %5
+ %10 = load i32, i32* %4, align 4
+ %11 = icmp ne i32 %10, 0
+ br i1 %11, label %12, label %13
+
+; <label>:12: ; preds = %9
+ call void @_Z10sideeffecti(i32 1)
+ call void @_Z10sideeffecti(i32 11)
+ br label %14
+
+; <label>:13: ; preds = %9
+ call void @_Z10sideeffecti(i32 2)
+ br label %15
+
+; <label>:14: ; preds = %12, %8
+ call void @_Z4sinkv() #3
+ call void @_Z10sideeffecti(i32 3)
+ br label %15
+
+; <label>:15: ; preds = %14, %13
+ ret void
+}
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinkv() cold
diff --git a/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll b/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
index 2a132bd..4c4e1f9 100644
--- a/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
+++ b/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
@@ -55,6 +55,47 @@
ret void
}
+; This is the same as @foo, but the while loop comes after the sink block.
+; CHECK-LABEL: define {{.*}}@while_loop_after_sink(
+; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @while_loop_after_sink.cold.1
+; CHECK-LABEL: if.end:
+; CHECK: call void @sideeffect(i32 1)
+define void @while_loop_after_sink(i32 %cond) {
+entry:
+ %tobool = icmp eq i32 %cond, 0
+ br i1 %tobool, label %if.end, label %sink
+
+sink:
+ tail call void (...) @sink()
+ br label %while.cond.preheader
+
+while.cond.preheader:
+ %cmp3 = icmp sgt i32 %cond, 10
+ br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %while.cond.preheader
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
+ %dec = add nsw i32 %cond.addr.04, -1
+ tail call void @sideeffect(i32 0) #3
+ %cmp = icmp sgt i32 %dec, 10
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %while.cond.preheader
+ ret void
+
+if.end: ; preds = %entry
+ tail call void @sideeffect(i32 1)
+ ret void
+}
+
; CHECK-LABEL: define {{.*}}@foo.cold.1
; CHECK: phi i32
; CHECK-NEXT: add nsw i32
@@ -62,6 +103,14 @@
; CHECK-NEXT: icmp
; CHECK-NEXT: br
+; CHECK-LABEL: define {{.*}}@while_loop_after_sink.cold.1
+; CHECK: call {{.*}}@sink
+; CHECK: phi i32
+; CHECK-NEXT: add nsw i32
+; CHECK-NEXT: call {{.*}}@sideeffect
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+
declare void @sideeffect(i32)
declare void @sink(...) cold
diff --git a/llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll b/llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll
new file mode 100644
index 0000000..6e8b13a
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: call {{.*}}@sink
+; CHECK: %p.ce = phi i32 [ 1, %coldbb ], [ 3, %coldbb2 ]
+; CHECK-NEXT: store i32 %p.ce, i32* %p.ce.out
+
+define void @foo(i32 %cond) {
+entry:
+ %tobool = icmp eq i32 %cond, 0
+ br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+ call void @sink()
+ call void @sideeffect()
+ call void @sideeffect()
+ br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+ br label %if.end
+
+if.end:
+ %p = phi i32 [0, %entry], [1, %coldbb], [3, %coldbb2]
+ ret void
+}
+
+declare void @sink() cold
+
+declare void @sideeffect()
diff --git a/llvm/test/Transforms/HotColdSplit/region-overlap.ll b/llvm/test/Transforms/HotColdSplit/region-overlap.ll
new file mode 100644
index 0000000..8064a54
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/region-overlap.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; __attribute__((cold)) extern void sink(int);
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+; if (cond1) {
+; if (cond2) { // This is the first cold region we visit.
+; sideeffect(0);
+; sideeffect(10);
+; sink(0);
+; }
+;
+; // There's a larger, overlapping cold region here. But we ignore it.
+; // This could be improved.
+; sideeffect(1);
+; sideeffect(11);
+; sink(1);
+; }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii
+; CHECK: call {{.*}}@_Z3fooii.cold.1
+; CHECK-NOT: _Z3fooii.cold
+define void @_Z3fooii(i32, i32) {
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ store i32 %0, i32* %3, align 4
+ store i32 %1, i32* %4, align 4
+ %5 = load i32, i32* %3, align 4
+ %6 = icmp ne i32 %5, 0
+ br i1 %6, label %7, label %12
+
+; <label>:7: ; preds = %2
+ %8 = load i32, i32* %4, align 4
+ %9 = icmp ne i32 %8, 0
+ br i1 %9, label %10, label %11
+
+; <label>:10: ; preds = %7
+ call void @_Z10sideeffecti(i32 0)
+ call void @_Z10sideeffecti(i32 10)
+ call void @_Z4sinki(i32 0) #3
+ br label %11
+
+; <label>:11: ; preds = %10, %7
+ call void @_Z10sideeffecti(i32 1)
+ call void @_Z10sideeffecti(i32 11)
+ call void @_Z4sinki(i32 1) #3
+ br label %12
+
+; <label>:12: ; preds = %11, %2
+ ret void
+}
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.1
+; CHECK: call void @_Z10sideeffecti(i32 0)
+; CHECK: call void @_Z10sideeffecti(i32 10)
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinki(i32) cold
diff --git a/llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll b/llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll
new file mode 100644
index 0000000..f8cff9e
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@exit_block_with_same_incoming_vals
+; CHECK: call {{.*}}@exit_block_with_same_incoming_vals.cold.1(
+; CHECK-NOT: br i1 undef
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+define void @exit_block_with_same_incoming_vals(i32 %cond) {
+entry:
+ %tobool = icmp eq i32 %cond, 0
+ br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+ call void @sink()
+ call void @sideeffect()
+ call void @sideeffect()
+ br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+ %p2 = phi i32 [0, %coldbb], [1, %coldbb2]
+ br i1 undef, label %if.end, label %coldbb2
+
+if.end:
+ %p = phi i32 [0, %entry], [1, %coldbb], [1, %coldbb2]
+ ret void
+}
+
+; CHECK-LABEL: define {{.*}}@exit_block_with_distinct_incoming_vals
+; CHECK: call {{.*}}@exit_block_with_distinct_incoming_vals.cold.1(
+; CHECK-NOT: br i1 undef
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+define void @exit_block_with_distinct_incoming_vals(i32 %cond) {
+entry:
+ %tobool = icmp eq i32 %cond, 0
+ br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+ call void @sink()
+ call void @sideeffect()
+ call void @sideeffect()
+ br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+ %p2 = phi i32 [0, %coldbb], [1, %coldbb2]
+ br i1 undef, label %if.end, label %coldbb2
+
+if.end:
+ %p = phi i32 [0, %entry], [1, %coldbb], [2, %coldbb2]
+ ret void
+}
+
+declare void @sink() cold
+
+declare void @sideeffect()