Add heuristics for irreducible loop metadata under PGO
Summary:
Add the following heuristics for irreducible loop metadata:
- When an irreducible loop header is missing the loop header weight metadata,
give it the minimum weight seen among other headers.
- Annotate indirectbr targets with the loop header weight metadata (as they are
likely to become irreducible loop headers after indirectbr tail duplication.)
These greatly improve the accuracy of the block frequency info of the Python
interpreter loop (e.g. from ~3-16x off down to ~40-55% off) and the Python
performance (e.g. unpack_sequence from ~50% slower to ~8% faster than GCC) due to
better register allocation under PGO.
Reviewers: davidxl
Reviewed By: davidxl
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D39980
llvm-svn: 318693
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 7b916e3..9105679 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -16,6 +16,7 @@
#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
@@ -1155,35 +1156,56 @@
DEBUG(dbgs() << "isIrreducible = true\n");
Distribution Dist;
unsigned NumHeadersWithWeight = 0;
+ Optional<uint64_t> MinHeaderWeight;
+ DenseSet<uint32_t> HeadersWithoutWeight;
+ HeadersWithoutWeight.reserve(Loop.NumHeaders);
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
auto &HeaderNode = Loop.Nodes[H];
const BlockT *Block = getBlock(HeaderNode);
IsIrrLoopHeader.set(Loop.Nodes[H].Index);
Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
- if (!HeaderWeight)
+ if (!HeaderWeight) {
+ DEBUG(dbgs() << "Missing irr loop header metadata on "
+ << getBlockName(HeaderNode) << "\n");
+ HeadersWithoutWeight.insert(H);
continue;
+ }
DEBUG(dbgs() << getBlockName(HeaderNode)
<< " has irr loop header weight " << HeaderWeight.getValue()
<< "\n");
NumHeadersWithWeight++;
uint64_t HeaderWeightValue = HeaderWeight.getValue();
- if (HeaderWeightValue)
+ if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
+ MinHeaderWeight = HeaderWeightValue;
+ if (HeaderWeightValue) {
Dist.addLocal(HeaderNode, HeaderWeightValue);
- }
- if (NumHeadersWithWeight != Loop.NumHeaders) {
- // Not all headers have a weight metadata. Distribute weight evenly.
- Dist = Distribution();
- for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
- auto &HeaderNode = Loop.Nodes[H];
- Dist.addLocal(HeaderNode, 1);
}
}
+ // As a heuristic, if some headers don't have a weight, give them the
+ // minimum weight seen (not to disrupt the existing trends too much by
+ // using a weight that's in the general range of the other headers' weights,
+ // and the minimum seems to perform better than the average.)
+ // FIXME: better update in the passes that drop the header weight.
+ // If no headers have a weight, give them even weight (use weight 1).
+ if (!MinHeaderWeight)
+ MinHeaderWeight = 1;
+ for (uint32_t H : HeadersWithoutWeight) {
+ auto &HeaderNode = Loop.Nodes[H];
+ const BlockT *Block = getBlock(HeaderNode);
+ assert(!Block->getIrrLoopHeaderWeight() &&
+ "Shouldn't have a weight metadata");
+ uint64_t MinWeight = MinHeaderWeight.getValue();
+ DEBUG(dbgs() << "Giving weight " << MinWeight
+ << " to " << getBlockName(HeaderNode) << "\n");
+ if (MinWeight)
+ Dist.addLocal(HeaderNode, MinWeight);
+ }
distributeIrrLoopHeaderMass(Dist);
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
- if (NumHeadersWithWeight != Loop.NumHeaders)
- // Not all headers have a weight metadata. Adjust header mass.
+ if (NumHeadersWithWeight == 0)
+ // No headers have weight metadata. Adjust header mass.
adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index c92d483..47278e1 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1188,11 +1188,22 @@
}
}
+static bool isIndirectBrTarget(BasicBlock *BB) {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ return true;
+ }
+ return false;
+}
+
void PGOUseFunc::annotateIrrLoopHeaderWeights() {
DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
// Find irr loop headers
for (auto &BB : F) {
- if (BFI->isIrrLoopHeader(&BB)) {
+ // As a heuristic, also annotate indirectbr targets as they have a high
+ // chance to become an irreducible loop header after the indirectbr tail
+ // duplication.
+ if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
TerminatorInst *TI = BB.getTerminator();
const UseBBInfo &BBCountInfo = getBBInfo(&BB);
setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
index 3eb0597..8a18cba 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll
@@ -159,3 +159,68 @@
; CHECK-NEXT: - sw.default: {{.*}} count = 0
; CHECK-NEXT: - exit: {{.*}} count = 1
; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400
+
+; Missing some irr loop annotations.
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh2(i8* nocapture readonly %p) !prof !27 {
+entry:
+ %0 = load i32, i32* @tracing, align 4
+ %1 = trunc i32 %0 to i8
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+switch i8 %1, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ], !prof !36
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ br label %sw.bb6
+
+sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
+ br i1 %tobool, label %dispatch_op, label %if.then, !prof !37 ; Missing !irr_loop !38
+
+if.then: ; preds = %sw.bb6
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40
+
+if.then18: ; preds = %sw.bb15
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ ret i32 0
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %idxprom21 = zext i32 %0 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %target = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %target, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42
+}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh2':
+; CHECK: block-frequency-info: _Z11irreduciblePh2
+; CHECK: - sw.bb6: {{.*}} count = 100
+; CHECK: - sw.bb15: {{.*}} count = 100, irr_loop_header_weight = 100
+; CHECK: - indirectgoto: {{.*}} count = 400, irr_loop_header_weight = 400
diff --git a/llvm/test/Transforms/PGOProfile/irreducible.ll b/llvm/test/Transforms/PGOProfile/irreducible.ll
index 9b2c8f6..9394b72 100644
--- a/llvm/test/Transforms/PGOProfile/irreducible.ll
+++ b/llvm/test/Transforms/PGOProfile/irreducible.ll
@@ -91,6 +91,7 @@
TARGET_1: ; preds = %indirectgoto
br label %sw.bb6
+; USE: br label %sw.bb6, !irr_loop {{.*}}
sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
br i1 %tobool, label %dispatch_op, label %if.then
@@ -102,6 +103,7 @@
TARGET_2: ; preds = %indirectgoto
br label %sw.bb15
+; USE: br label %sw.bb15, !irr_loop {{.*}}
sw.bb15: ; preds = %TARGET_2, %dispatch_op
br i1 %tobool, label %if.then18, label %exit