[HotColdSplitting] Outline more than once per function

Algorithm: Identify maximal cold regions and put them in a worklist. If
a candidate region overlaps with another, discard it. While the worklist
is full, remove a single-entry sub-region from the worklist and attempt
to outline it. By the non-overlap property, this should not invalidate
parts of the domtree pertaining to other outlining regions.

Testing: LNT results on X86 are clean. With test-suite + externals, llvm
outlines 134KB pre-patch, and 352KB post-patch (+ ~2.6x). The file
483.xalancbmk/src/Constants.cpp stands out as an extreme case where llvm
outlines over 100 times in some functions (mostly EH paths). There was
not a significant performance impact pre vs. post-patch.

Differential Revision: https://reviews.llvm.org/D53887

llvm-svn: 348639
diff --git a/llvm/test/Transforms/HotColdSplit/eh-pads.ll b/llvm/test/Transforms/HotColdSplit/eh-pads.ll
new file mode 100644
index 0000000..cf413b4
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/eh-pads.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: landingpad
+; CHECK: sideeffect(i32 2)
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: sideeffect(i32 0)
+; CHECK: sideeffect(i32 1)
+; CHECK: sink
+
+define void @foo(i32 %cond) personality i8 0 {
+entry:
+  invoke void @llvm.donothing() to label %normal unwind label %exception
+
+exception:
+  ; Note: EH pads are not candidates for region entry points.
+  %cleanup = landingpad i8 cleanup
+  br label %continue_exception
+
+continue_exception:
+  call void @sideeffect(i32 0)
+  call void @sideeffect(i32 1)
+  call void @sink()
+  ret void
+
+normal:
+  call void @sideeffect(i32 2)
+  ret void
+}
+
+declare void @sideeffect(i32)
+
+declare void @sink() cold
+
+declare void @llvm.donothing() nounwind readnone
diff --git a/llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll b/llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll
new file mode 100644
index 0000000..11256a4
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/extraction-subregion-breaks-phis.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: call {{.*}}@foo.cold.1(
+; CHECK: unreachable
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: switch i32 undef, label %sw.epilog.i
+define void @foo(i32 %QMM) {
+entry:
+  switch i32 %QMM, label %entry.if.end16_crit_edge [
+    i32 1, label %if.then
+  ]
+
+entry.if.end16_crit_edge:                         ; preds = %entry
+  br label %if.end16
+
+if.then:                                          ; preds = %entry
+  br i1 undef, label %cond.true.i.i, label %_ZN10StringView8popFrontEv.exit.i
+
+cond.true.i.i:                                    ; preds = %if.then
+  ret void
+
+_ZN10StringView8popFrontEv.exit.i:                ; preds = %if.then
+  switch i32 undef, label %sw.epilog.i [
+    i32 81, label %if.end16
+    i32 82, label %sw.bb4.i
+    i32 83, label %sw.bb8.i
+    i32 84, label %sw.bb12.i
+    i32 65, label %if.end16
+    i32 66, label %sw.bb20.i
+    i32 67, label %sw.bb24.i
+    i32 68, label %sw.bb28.i
+  ]
+
+sw.bb4.i:                                         ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb8.i:                                         ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb12.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb20.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb24.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.bb28.i:                                        ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+sw.epilog.i:                                      ; preds = %_ZN10StringView8popFrontEv.exit.i
+  br label %if.end16
+
+if.end16:                                         ; preds = %sw.epilog.i, %sw.bb28.i, %sw.bb24.i, %sw.bb20.i, %sw.bb12.i, %sw.bb8.i, %sw.bb4.i, %_ZN10StringView8popFrontEv.exit.i, %_ZN10StringView8popFrontEv.exit.i, %entry.if.end16_crit_edge
+  %0 = phi i8 [ 0, %entry.if.end16_crit_edge ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 0, %_ZN10StringView8popFrontEv.exit.i ], [ 1, %sw.bb4.i ], [ 2, %sw.bb8.i ], [ 3, %sw.bb12.i ], [ 1, %sw.bb20.i ], [ 2, %sw.bb24.i ], [ 3, %sw.bb28.i ], [ 0, %sw.epilog.i ]
+  unreachable
+}
diff --git a/llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll b/llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll
new file mode 100644
index 0000000..81a8f42
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/forward-dfs-reaches-marked-block.ll
@@ -0,0 +1,29 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@fun
+; CHECK: call {{.*}}@fun.cold.1(
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ; This will be marked by the inverse DFS on sink-predecesors.
+  br label %sink
+
+sink:
+  call void @sink()
+
+  ; Do not allow the forward-DFS on sink-successors to mark the block again.
+  br i1 undef, label %if.then, label %if.then.exit
+
+if.then.exit:
+  ret void
+
+if.else:
+  ret void
+}
+
+declare void @sink() cold
diff --git a/llvm/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll b/llvm/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll
new file mode 100644
index 0000000..d8bd55e
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/mark-the-whole-func-cold.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+; 
+; extern __attribute__((cold)) void sink();
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+;     if (cond1) {
+;         if (cond2) {
+;             sideeffect(0);
+;         } else {
+;             sideeffect(1);
+;         }
+;         sink();
+;     } else {
+;         sideeffect(2);
+;     }
+;     sink();
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK: define {{.*}}@_Z3fooii{{.*}}#[[outlined_func_attr:[0-9]+]]
+; CHECK-NOT: _Z3fooii.cold
+; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
+define void @_Z3fooii(i32, i32) {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = load i32, i32* %3, align 4
+  %6 = icmp ne i32 %5, 0
+  br i1 %6, label %7, label %13
+
+; <label>:7:                                      ; preds = %2
+  %8 = load i32, i32* %4, align 4
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %10, label %11
+
+; <label>:10:                                     ; preds = %7
+  call void @_Z10sideeffecti(i32 0)
+  br label %12
+
+; <label>:11:                                     ; preds = %7
+  call void @_Z10sideeffecti(i32 1)
+  br label %12
+
+; <label>:12:                                     ; preds = %11, %10
+  call void @_Z4sinkv() #3
+  br label %14
+
+; <label>:13:                                     ; preds = %2
+  call void @_Z10sideeffecti(i32 2)
+  br label %14
+
+; <label>:14:                                     ; preds = %13, %12
+  call void @_Z4sinkv() #3
+  ret void
+}
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinkv() cold
diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
new file mode 100644
index 0000000..d10240a
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -hotcoldsplit < %s 2>&1 | FileCheck %s
+
+; CHECK-LABEL: define {{.*}}@fun
+; CHECK: call {{.*}}@fun.cold.2(
+; CHECK-NEXT: ret void
+; CHECK: call {{.*}}@fun.cold.1(
+; CHECK-NEXT: ret void
+define void @fun() {
+entry:
+  br i1 undef, label %A.then, label %A.else
+
+A.else:
+  br label %A.then4
+
+A.then4:
+  br i1 undef, label %A.then5, label %A.end
+
+A.then5:
+  br label %A.cleanup
+
+A.end:
+  br label %A.cleanup
+
+A.cleanup:
+  %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+  unreachable
+
+A.then:
+  br i1 undef, label %B.then, label %B.else
+
+B.then:
+  ret void
+
+B.else:
+  br label %B.then4
+
+B.then4:
+  br i1 undef, label %B.then5, label %B.end
+
+B.then5:
+  br label %B.cleanup
+
+B.end:
+  br label %B.cleanup
+
+B.cleanup:
+  %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+  unreachable
+}
+
+; CHECK-LABEL: define {{.*}}@fun.cold.1(
+; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ]
+; CHECK-NEXT: unreachable
+
+; CHECK-LABEL: define {{.*}}@fun.cold.2(
+; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ]
+; CHECK-NEXT: unreachable
diff --git a/llvm/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll b/llvm/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll
new file mode 100644
index 0000000..8fd20c0
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-multiple-entry-region.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern __attribute__((cold)) void sink();
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+;     while (true) {
+;         if (cond1) {
+;             sideeffect(0); // This is cold (it reaches sink()).
+;             break;
+;         }
+;         if (cond2) {
+;             sideeffect(1); // This is cold (it reaches sink()).
+;             break;
+;         }
+;         sideeffect(2);
+;         return;
+;     }
+;     sink();
+;     sideeffect(3);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.1
+; CHECK: call void @_Z10sideeffecti(i32 1)
+; CHECK: call void @_Z10sideeffecti(i32 11)
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.2
+; CHECK: call void @_Z10sideeffecti(i32 0)
+; CHECK: call void @_Z10sideeffecti(i32 10)
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.3
+; CHECK: call void @_Z4sinkv
+; CHECK: call void @_Z10sideeffecti(i32 3)
+
+define void @_Z3fooii(i32, i32) {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  br label %5
+
+; <label>:5:                                      ; preds = %2
+  %6 = load i32, i32* %3, align 4
+  %7 = icmp ne i32 %6, 0
+  br i1 %7, label %8, label %9
+
+; <label>:8:                                      ; preds = %5
+  call void @_Z10sideeffecti(i32 0)
+  call void @_Z10sideeffecti(i32 10)
+  br label %14
+
+; <label>:9:                                      ; preds = %5
+  %10 = load i32, i32* %4, align 4
+  %11 = icmp ne i32 %10, 0
+  br i1 %11, label %12, label %13
+
+; <label>:12:                                     ; preds = %9
+  call void @_Z10sideeffecti(i32 1)
+  call void @_Z10sideeffecti(i32 11)
+  br label %14
+
+; <label>:13:                                     ; preds = %9
+  call void @_Z10sideeffecti(i32 2)
+  br label %15
+
+; <label>:14:                                     ; preds = %12, %8
+  call void @_Z4sinkv() #3
+  call void @_Z10sideeffecti(i32 3)
+  br label %15
+
+; <label>:15:                                     ; preds = %14, %13
+  ret void
+}
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinkv() cold
diff --git a/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll b/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
index 2a132bd..4c4e1f9 100644
--- a/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
+++ b/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
@@ -55,6 +55,47 @@
   ret void
 }
 
+; This is the same as @foo, but the while loop comes after the sink block.
+; CHECK-LABEL: define {{.*}}@while_loop_after_sink(
+; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @while_loop_after_sink.cold.1
+; CHECK-LABEL: if.end:
+; CHECK: call void @sideeffect(i32 1)
+define void @while_loop_after_sink(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %sink
+
+sink:
+  tail call void (...) @sink()
+  br label %while.cond.preheader
+
+while.cond.preheader:
+  %cmp3 = icmp sgt i32 %cond, 10
+  br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %while.cond.preheader
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
+  %dec = add nsw i32 %cond.addr.04, -1
+  tail call void @sideeffect(i32 0) #3
+  %cmp = icmp sgt i32 %dec, 10
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %while.cond.preheader
+  ret void
+
+if.end:                                           ; preds = %entry
+  tail call void @sideeffect(i32 1)
+  ret void
+}
+
 ; CHECK-LABEL: define {{.*}}@foo.cold.1
 ; CHECK: phi i32
 ; CHECK-NEXT: add nsw i32
@@ -62,6 +103,14 @@
 ; CHECK-NEXT: icmp
 ; CHECK-NEXT: br
 
+; CHECK-LABEL: define {{.*}}@while_loop_after_sink.cold.1
+; CHECK: call {{.*}}@sink
+; CHECK: phi i32
+; CHECK-NEXT: add nsw i32
+; CHECK-NEXT: call {{.*}}@sideeffect
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+
 declare void @sideeffect(i32)
 
 declare void @sink(...) cold
diff --git a/llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll b/llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll
new file mode 100644
index 0000000..6e8b13a
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/phi-with-distinct-outlined-values.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; CHECK: call {{.*}}@sink
+; CHECK: %p.ce = phi i32 [ 1, %coldbb ], [ 3, %coldbb2 ]
+; CHECK-NEXT: store i32 %p.ce, i32* %p.ce.out 
+
+define void @foo(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+  call void @sink()
+  call void @sideeffect()
+  call void @sideeffect()
+  br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+  br label %if.end
+
+if.end:
+  %p = phi i32 [0, %entry], [1, %coldbb], [3, %coldbb2]
+  ret void
+}
+
+declare void @sink() cold
+
+declare void @sideeffect()
diff --git a/llvm/test/Transforms/HotColdSplit/region-overlap.ll b/llvm/test/Transforms/HotColdSplit/region-overlap.ll
new file mode 100644
index 0000000..8064a54
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/region-overlap.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; __attribute__((cold)) extern void sink(int);
+; extern void sideeffect(int);
+; void foo(int cond1, int cond2) {
+;     if (cond1) {
+;         if (cond2) { // This is the first cold region we visit.
+;             sideeffect(0);
+;             sideeffect(10);
+;             sink(0);
+;         }
+;
+;         // There's a larger, overlapping cold region here. But we ignore it.
+;         // This could be improved.
+;         sideeffect(1);
+;         sideeffect(11);
+;         sink(1);
+;     }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii
+; CHECK: call {{.*}}@_Z3fooii.cold.1
+; CHECK-NOT: _Z3fooii.cold
+define void @_Z3fooii(i32, i32) {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = load i32, i32* %3, align 4
+  %6 = icmp ne i32 %5, 0
+  br i1 %6, label %7, label %12
+
+; <label>:7:                                      ; preds = %2
+  %8 = load i32, i32* %4, align 4
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %10, label %11
+
+; <label>:10:                                     ; preds = %7
+  call void @_Z10sideeffecti(i32 0)
+  call void @_Z10sideeffecti(i32 10)
+  call void @_Z4sinki(i32 0) #3
+  br label %11
+
+; <label>:11:                                     ; preds = %10, %7
+  call void @_Z10sideeffecti(i32 1)
+  call void @_Z10sideeffecti(i32 11)
+  call void @_Z4sinki(i32 1) #3
+  br label %12
+
+; <label>:12:                                     ; preds = %11, %2
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@_Z3fooii.cold.1
+; CHECK: call void @_Z10sideeffecti(i32 0)
+; CHECK: call void @_Z10sideeffecti(i32 10)
+
+declare void @_Z10sideeffecti(i32)
+
+declare void @_Z4sinki(i32) cold
diff --git a/llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll b/llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll
new file mode 100644
index 0000000..f8cff9e
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/succ-block-with-self-edge.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@exit_block_with_same_incoming_vals
+; CHECK: call {{.*}}@exit_block_with_same_incoming_vals.cold.1(
+; CHECK-NOT: br i1 undef
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+define void @exit_block_with_same_incoming_vals(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+  call void @sink()
+  call void @sideeffect()
+  call void @sideeffect()
+  br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+  %p2 = phi i32 [0, %coldbb], [1, %coldbb2]
+  br i1 undef, label %if.end, label %coldbb2
+
+if.end:
+  %p = phi i32 [0, %entry], [1, %coldbb], [1, %coldbb2]
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@exit_block_with_distinct_incoming_vals
+; CHECK: call {{.*}}@exit_block_with_distinct_incoming_vals.cold.1(
+; CHECK-NOT: br i1 undef
+; CHECK: phi i32 [ 0, %entry ], [ %p.ce.reload, %codeRepl ]
+define void @exit_block_with_distinct_incoming_vals(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %coldbb
+
+coldbb:
+  call void @sink()
+  call void @sideeffect()
+  call void @sideeffect()
+  br i1 undef, label %if.end, label %coldbb2
+
+coldbb2:
+  %p2 = phi i32 [0, %coldbb], [1, %coldbb2]
+  br i1 undef, label %if.end, label %coldbb2
+
+if.end:
+  %p = phi i32 [0, %entry], [1, %coldbb], [2, %coldbb2]
+  ret void
+}
+
+declare void @sink() cold
+
+declare void @sideeffect()