[HotColdSplitting] Identify larger cold regions using domtree queries
The current splitting algorithm works in three stages:
1) Identify cold blocks, then
2) Use forward/backward propagation to mark hot blocks, then
3) Grow a SESE region of blocks *outside* of the set of hot blocks and
start outlining.
While testing this pass on Apple internal frameworks, I noticed that some
kinds of control flow (e.g. loops) are never outlined, even though they
unconditionally lead to or follow cold blocks. I noticed two other issues
related to how cold regions are identified:
- An inconsistency can arise in the internal state of the hotness
propagation stage, as a block may end up in both the ColdBlocks set
and the HotBlocks set. Further inconsistencies can arise because these
sets do not match what's in ProfileSummaryInfo.
- Outlining is limited to single-exit regions, which isn't necessary.
This patch teaches the splitting algorithm to identify maximal cold
regions and outline them. A maximal cold region is defined as the set of
blocks that are either post-dominated by a cold sink block or dominated
by that sink block. This approach can successfully outline loops in the
cold path. As a side benefit, it maintains less internal state than the
current approach.
Due to a limitation in CodeExtractor, blocks within the maximal cold
region which aren't dominated by a single entry point (a so-called "max
ancestor") are filtered out.
Results:
- X86 (LNT + -Os + externals): 134KB of TEXT were outlined compared to
47KB pre-patch, or a ~3x improvement. Did not see a performance impact
across two runs.
- AArch64 (LNT + -Os + externals + Apple-internal benchmarks): 149KB
of TEXT were outlined. As with X86, did not see a performance impact.
- Outlining results improve marginally in the internal frameworks I
tested.
Follow-ups:
- Outline more than once per function, outline large single basic
blocks, & try to remove unconditional branches in outlined functions.
Differential Revision: https://reviews.llvm.org/D53627
llvm-svn: 345209
diff --git a/llvm/test/Transforms/HotColdSplit/split-cold-1.ll b/llvm/test/Transforms/HotColdSplit/do-not-split.ll
similarity index 71%
rename from llvm/test/Transforms/HotColdSplit/split-cold-1.ll
rename to llvm/test/Transforms/HotColdSplit/do-not-split.ll
index 1a8138f..1f62658 100644
--- a/llvm/test/Transforms/HotColdSplit/split-cold-1.ll
+++ b/llvm/test/Transforms/HotColdSplit/do-not-split.ll
@@ -1,9 +1,10 @@
; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
-; Check that the function is not split. Outlined function is called from a
+; Check that these functions are not split. Outlined functions are called from a
; basic block named codeRepl.
+; The cold region is too small to split.
; CHECK-LABEL: @foo
; CHECK-NOT: codeRepl
define void @foo() {
@@ -26,11 +27,9 @@
ret void
}
-; Check that the function is not split. We used to outline the full function.
-
+; Make sure we don't try to outline the entire function.
; CHECK-LABEL: @fun
; CHECK-NOT: codeRepl
-
define void @fun() {
entry:
br i1 undef, label %if.then, label %if.end
@@ -41,3 +40,17 @@
if.end: ; preds = %entry
ret void
}
+
+; Don't outline infinite loops.
+; CHECK-LABEL: @infinite_loop
+; CHECK-NOT: codeRepl
+define void @infinite_loop() {
+entry:
+ br label %loop
+
+loop:
+ call void @sink()
+ br label %loop
+}
+
+declare void @sink() cold
diff --git a/llvm/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll b/llvm/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
new file mode 100644
index 0000000..17001f9
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+declare void @sideeffect(i64)
+
+declare i8* @realloc(i8* %ptr, i64 %size)
+
+declare void @free(i8* %ptr)
+
+declare void @sink() cold
+
+; CHECK-LABEL: define {{.*}}@realloc2(
+; CHECK: call {{.*}}@sideeffect(
+; CHECK: call {{.*}}@realloc(
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call {{.*}}@realloc2.cold.1(i64 %size, i8* %ptr)
+; CHECK-LABEL: cleanup:
+; CHECK-NEXT: phi i8* [ null, %if.then ], [ null, %codeRepl ], [ %call, %if.end ]
+define i8* @realloc2(i8* %ptr, i64 %size) {
+entry:
+ %0 = add i64 %size, -1
+ %1 = icmp ugt i64 %0, 184549375
+ br i1 %1, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @sideeffect(i64 %size)
+ br label %cleanup
+
+if.end: ; preds = %entry
+ %call = call i8* @realloc(i8* %ptr, i64 %size)
+ %tobool1 = icmp eq i8* %call, null
+ br i1 %tobool1, label %if.then2, label %cleanup
+
+if.then2: ; preds = %if.end
+ call void @sideeffect(i64 %size)
+ call void @sink()
+ %tobool3 = icmp eq i8* %ptr, null
+ br i1 %tobool3, label %cleanup, label %if.then4
+
+if.then4: ; preds = %if.then2
+ call void @free(i8* %ptr)
+ br label %cleanup
+
+cleanup: ; preds = %if.end, %if.then4, %if.then2, %if.then
+ %retval.0 = phi i8* [ null, %if.then ], [ null, %if.then2 ], [ null, %if.then4 ], [ %call, %if.end ]
+ ret i8* %retval.0
+}
+
+; CHECK-LABEL: define {{.*}}@realloc2.cold.1(
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+; CHECK: call {{.*}}@free
diff --git a/llvm/test/Transforms/HotColdSplit/multiple-exits.ll b/llvm/test/Transforms/HotColdSplit/multiple-exits.ll
new file mode 100644
index 0000000..2e7cf84
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/multiple-exits.ll
@@ -0,0 +1,73 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+; if (cond) { //< Start outlining here.
+; sink();
+; if (cond > 10)
+; goto exit1;
+; else
+; goto exit2;
+; }
+; exit1:
+; sideeffect(1);
+; return;
+; exit2:
+; sideeffect(2);
+; return;
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %exit1, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK: [[targetBlock:%.*]] = call i1 @foo.cold.1(
+; CHECK-NEXT: br i1 [[targetBlock]], label %exit1, label %[[return:.*]]
+; CHECK-LABEL: exit1:
+; CHECK: call {{.*}}@sideeffect(i32 1)
+; CHECK: [[return]]:
+; CHECK-NEXT: ret void
+define void @foo(i32 %cond) {
+entry:
+ %tobool = icmp eq i32 %cond, 0
+ br i1 %tobool, label %exit1, label %if.then
+
+if.then: ; preds = %entry
+ tail call void (...) @sink()
+ %cmp = icmp sgt i32 %cond, 10
+ br i1 %cmp, label %exit1, label %exit2
+
+exit1: ; preds = %entry, %if.then
+ call void @sideeffect(i32 1)
+ br label %return
+
+exit2: ; preds = %if.then
+ call void @sideeffect(i32 2)
+ br label %return
+
+return: ; preds = %exit2, %exit1
+ ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; TODO: Eliminate this unnecessary unconditional branch.
+; CHECK: br
+; CHECK: [[exit1Stub:.*]]:
+; CHECK-NEXT: ret i1 true
+; CHECK: [[returnStub:.*]]:
+; CHECK-NEXT: ret i1 false
+; CHECK: call {{.*}}@sink
+; CHECK-NEXT: [[cmp:%.*]] = icmp
+; CHECK-NEXT: br i1 [[cmp]], label %[[exit1Stub]], label %exit2
+; CHECK-LABEL: exit2:
+; CHECK-NEXT: call {{.*}}@sideeffect(i32 2)
+; CHECK-NEXT: br label %[[returnStub]]
+
+declare void @sink(...) cold
+
+declare void @sideeffect(i32)
diff --git a/llvm/test/Transforms/HotColdSplit/outline-if-then-else.ll b/llvm/test/Transforms/HotColdSplit/outline-if-then-else.ll
new file mode 100644
index 0000000..bbde765
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-if-then-else.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+; if (cond) { //< Start outlining here.
+; if (cond > 10)
+; sideeffect(0);
+; else
+; sideeffect(1);
+; sink();
+; }
+; sideeffect(2);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %codeRepl, label %if.end2
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end2:
+; CHECK: call void @sideeffect(i32 2)
+define void @foo(i32 %cond) {
+entry:
+ %cond.addr = alloca i32
+ store i32 %cond, i32* %cond.addr
+ %0 = load i32, i32* %cond.addr
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.end2
+
+if.then: ; preds = %entry
+ %1 = load i32, i32* %cond.addr
+ %cmp = icmp sgt i32 %1, 10
+ br i1 %cmp, label %if.then1, label %if.else
+
+if.then1: ; preds = %if.then
+ call void @sideeffect(i32 0)
+ br label %if.end
+
+if.else: ; preds = %if.then
+ call void @sideeffect(i32 1)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then1
+ call void (...) @sink()
+ ret void
+
+if.end2: ; preds = %entry
+ call void @sideeffect(i32 2)
+ ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) cold
diff --git a/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll b/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
new file mode 100644
index 0000000..2a132bd
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/outline-while-loop.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+; if (cond) { //< Start outlining here.
+; while (cond > 10) {
+; --cond;
+; sideeffect(0);
+; }
+; sink();
+; }
+; sideeffect(1);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end:
+; CHECK: call void @sideeffect(i32 1)
+define void @foo(i32 %cond) {
+entry:
+ %tobool = icmp eq i32 %cond, 0
+ br i1 %tobool, label %if.end, label %while.cond.preheader
+
+while.cond.preheader: ; preds = %entry
+ %cmp3 = icmp sgt i32 %cond, 10
+ br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %while.cond.preheader
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
+ %dec = add nsw i32 %cond.addr.04, -1
+ tail call void @sideeffect(i32 0) #3
+ %cmp = icmp sgt i32 %dec, 10
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %while.cond.preheader
+ tail call void (...) @sink()
+ ret void
+
+if.end: ; preds = %entry
+ tail call void @sideeffect(i32 1)
+ ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: phi i32
+; CHECK-NEXT: add nsw i32
+; CHECK-NEXT: call {{.*}}@sideeffect
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) cold