|  | //===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation | 
|  | //--===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // This file implements an algorithm that returns for a divergent branch | 
|  | // the set of basic blocks whose phi nodes become divergent due to divergent | 
|  | // control. These are the blocks that are reachable by two disjoint paths from | 
|  | // the branch or loop exits that have a reaching path that is disjoint from a | 
|  | // path to the loop latch. | 
|  | // | 
|  | // The SyncDependenceAnalysis is used in the DivergenceAnalysis to model | 
|  | // control-induced divergence in phi nodes. | 
|  | // | 
|  | // -- Summary -- | 
|  | // The SyncDependenceAnalysis lazily computes sync dependences [3]. | 
|  | // The analysis evaluates the disjoint path criterion [2] by a reduction | 
|  | // to SSA construction. The SSA construction algorithm is implemented as | 
|  | // a simple data-flow analysis [1]. | 
|  | // | 
|  | // [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy | 
|  | // [2] "Efficiently Computing Static Single Assignment Form | 
|  | //     and the Control Dependence Graph", TOPLAS '91, | 
|  | //           Cytron, Ferrante, Rosen, Wegman and Zadeck | 
|  | // [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack | 
|  | // [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira | 
|  | // | 
|  | // -- Sync dependence -- | 
|  | // Sync dependence [4] characterizes the control flow aspect of the | 
|  | // propagation of branch divergence. For example, | 
|  | // | 
|  | //   %cond = icmp slt i32 %tid, 10 | 
|  | //   br i1 %cond, label %then, label %else | 
|  | // then: | 
|  | //   br label %merge | 
|  | // else: | 
|  | //   br label %merge | 
|  | // merge: | 
|  | //   %a = phi i32 [ 0, %then ], [ 1, %else ] | 
|  | // | 
|  | // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid | 
|  | // because %tid is not on its use-def chains, %a is sync dependent on %tid | 
|  | // because the branch "br i1 %cond" depends on %tid and affects which value %a | 
|  | // is assigned to. | 
|  | // | 
|  | // -- Reduction to SSA construction -- | 
|  | // There are two disjoint paths from A to X, if a certain variant of SSA | 
|  | // construction places a phi node in X under the following set-up scheme [2]. | 
|  | // | 
|  | // This variant of SSA construction ignores incoming undef values. | 
|  | // That is paths from the entry without a definition do not result in | 
|  | // phi nodes. | 
|  | // | 
|  | //       entry | 
|  | //     /      \ | 
|  | //    A        \ | 
|  | //  /   \       Y | 
|  | // B     C     / | 
|  | //  \   /  \  / | 
|  | //    D     E | 
|  | //     \   / | 
|  | //       F | 
|  | // Assume that A contains a divergent branch. We are interested | 
|  | // in the set of all blocks where each block is reachable from A | 
|  | // via two disjoint paths. This would be the set {D, F} in this | 
|  | // case. | 
|  | // To generally reduce this query to SSA construction we introduce | 
|  | // a virtual variable x and assign to x different values in each | 
|  | // successor block of A. | 
|  | //           entry | 
|  | //         /      \ | 
|  | //        A        \ | 
|  | //      /   \       Y | 
|  | // x = 0   x = 1   / | 
|  | //      \  /   \  / | 
|  | //        D     E | 
|  | //         \   / | 
|  | //           F | 
|  | // Our flavor of SSA construction for x will construct the following | 
|  | //            entry | 
|  | //          /      \ | 
|  | //         A        \ | 
|  | //       /   \       Y | 
|  | // x0 = 0   x1 = 1  / | 
|  | //       \   /   \ / | 
|  | //      x2=phi    E | 
|  | //         \     / | 
|  | //          x3=phi | 
|  | // The blocks D and F contain phi nodes and are thus each reachable | 
|  | // by two disjoins paths from A. | 
|  | // | 
|  | // -- Remarks -- | 
|  | // In case of loop exits we need to check the disjoint path criterion for loops | 
|  | // [2]. To this end, we check whether the definition of x differs between the | 
|  | // loop exit and the loop header (_after_ SSA construction). | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | #include "llvm/ADT/PostOrderIterator.h" | 
|  | #include "llvm/ADT/SmallPtrSet.h" | 
|  | #include "llvm/Analysis/PostDominators.h" | 
|  | #include "llvm/Analysis/SyncDependenceAnalysis.h" | 
|  | #include "llvm/IR/BasicBlock.h" | 
|  | #include "llvm/IR/CFG.h" | 
|  | #include "llvm/IR/Dominators.h" | 
|  | #include "llvm/IR/Function.h" | 
|  |  | 
|  | #include <stack> | 
|  | #include <unordered_set> | 
|  |  | 
|  | #define DEBUG_TYPE "sync-dependence" | 
|  |  | 
|  | namespace llvm { | 
|  |  | 
|  | ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet; | 
|  |  | 
|  | SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT, | 
|  | const PostDominatorTree &PDT, | 
|  | const LoopInfo &LI) | 
|  | : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {} | 
|  |  | 
|  | SyncDependenceAnalysis::~SyncDependenceAnalysis() {} | 
|  |  | 
|  | using FunctionRPOT = ReversePostOrderTraversal<const Function *>; | 
|  |  | 
|  | // divergence propagator for reducible CFGs | 
|  | struct DivergencePropagator { | 
|  | const FunctionRPOT &FuncRPOT; | 
|  | const DominatorTree &DT; | 
|  | const PostDominatorTree &PDT; | 
|  | const LoopInfo &LI; | 
|  |  | 
|  | // identified join points | 
|  | std::unique_ptr<ConstBlockSet> JoinBlocks; | 
|  |  | 
|  | // reached loop exits (by a path disjoint to a path to the loop header) | 
|  | SmallPtrSet<const BasicBlock *, 4> ReachedLoopExits; | 
|  |  | 
|  | // if DefMap[B] == C then C is the dominating definition at block B | 
|  | // if DefMap[B] ~ undef then we haven't seen B yet | 
|  | // if DefMap[B] == B then B is a join point of disjoint paths from X or B is | 
|  | // an immediate successor of X (initial value). | 
|  | using DefiningBlockMap = std::map<const BasicBlock *, const BasicBlock *>; | 
|  | DefiningBlockMap DefMap; | 
|  |  | 
|  | // all blocks with pending visits | 
|  | std::unordered_set<const BasicBlock *> PendingUpdates; | 
|  |  | 
|  | DivergencePropagator(const FunctionRPOT &FuncRPOT, const DominatorTree &DT, | 
|  | const PostDominatorTree &PDT, const LoopInfo &LI) | 
|  | : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI), | 
|  | JoinBlocks(new ConstBlockSet) {} | 
|  |  | 
|  | // set the definition at @block and mark @block as pending for a visit | 
|  | void addPending(const BasicBlock &Block, const BasicBlock &DefBlock) { | 
|  | bool WasAdded = DefMap.emplace(&Block, &DefBlock).second; | 
|  | if (WasAdded) | 
|  | PendingUpdates.insert(&Block); | 
|  | } | 
|  |  | 
|  | void printDefs(raw_ostream &Out) { | 
|  | Out << "Propagator::DefMap {\n"; | 
|  | for (const auto *Block : FuncRPOT) { | 
|  | auto It = DefMap.find(Block); | 
|  | Out << Block->getName() << " : "; | 
|  | if (It == DefMap.end()) { | 
|  | Out << "\n"; | 
|  | } else { | 
|  | const auto *DefBlock = It->second; | 
|  | Out << (DefBlock ? DefBlock->getName() : "<null>") << "\n"; | 
|  | } | 
|  | } | 
|  | Out << "}\n"; | 
|  | } | 
|  |  | 
|  | // process @succBlock with reaching definition @defBlock | 
|  | // the original divergent branch was in @parentLoop (if any) | 
|  | void visitSuccessor(const BasicBlock &SuccBlock, const Loop *ParentLoop, | 
|  | const BasicBlock &DefBlock) { | 
|  |  | 
|  | // @succBlock is a loop exit | 
|  | if (ParentLoop && !ParentLoop->contains(&SuccBlock)) { | 
|  | DefMap.emplace(&SuccBlock, &DefBlock); | 
|  | ReachedLoopExits.insert(&SuccBlock); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // first reaching def? | 
|  | auto ItLastDef = DefMap.find(&SuccBlock); | 
|  | if (ItLastDef == DefMap.end()) { | 
|  | addPending(SuccBlock, DefBlock); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // a join of at least two definitions | 
|  | if (ItLastDef->second != &DefBlock) { | 
|  | // do we know this join already? | 
|  | if (!JoinBlocks->insert(&SuccBlock).second) | 
|  | return; | 
|  |  | 
|  | // update the definition | 
|  | addPending(SuccBlock, SuccBlock); | 
|  | } | 
|  | } | 
|  |  | 
|  | // find all blocks reachable by two disjoint paths from @rootTerm. | 
|  | // This method works for both divergent terminators and loops with | 
|  | // divergent exits. | 
|  | // @rootBlock is either the block containing the branch or the header of the | 
|  | // divergent loop. | 
|  | // @nodeSuccessors is the set of successors of the node (Loop or Terminator) | 
|  | // headed by @rootBlock. | 
|  | // @parentLoop is the parent loop of the Loop or the loop that contains the | 
|  | // Terminator. | 
|  | template <typename SuccessorIterable> | 
|  | std::unique_ptr<ConstBlockSet> | 
|  | computeJoinPoints(const BasicBlock &RootBlock, | 
|  | SuccessorIterable NodeSuccessors, const Loop *ParentLoop, const BasicBlock * PdBoundBlock) { | 
|  | assert(JoinBlocks); | 
|  |  | 
|  | // bootstrap with branch targets | 
|  | for (const auto *SuccBlock : NodeSuccessors) { | 
|  | DefMap.emplace(SuccBlock, SuccBlock); | 
|  |  | 
|  | if (ParentLoop && !ParentLoop->contains(SuccBlock)) { | 
|  | // immediate loop exit from node. | 
|  | ReachedLoopExits.insert(SuccBlock); | 
|  | continue; | 
|  | } else { | 
|  | // regular successor | 
|  | PendingUpdates.insert(SuccBlock); | 
|  | } | 
|  | } | 
|  |  | 
|  | auto ItBeginRPO = FuncRPOT.begin(); | 
|  |  | 
|  | // skip until term (TODO RPOT won't let us start at @term directly) | 
|  | for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {} | 
|  |  | 
|  | auto ItEndRPO = FuncRPOT.end(); | 
|  | assert(ItBeginRPO != ItEndRPO); | 
|  |  | 
|  | // propagate definitions at the immediate successors of the node in RPO | 
|  | auto ItBlockRPO = ItBeginRPO; | 
|  | while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) { | 
|  | const auto *Block = *ItBlockRPO; | 
|  |  | 
|  | // skip @block if not pending update | 
|  | auto ItPending = PendingUpdates.find(Block); | 
|  | if (ItPending == PendingUpdates.end()) | 
|  | continue; | 
|  | PendingUpdates.erase(ItPending); | 
|  |  | 
|  | // propagate definition at @block to its successors | 
|  | auto ItDef = DefMap.find(Block); | 
|  | const auto *DefBlock = ItDef->second; | 
|  | assert(DefBlock); | 
|  |  | 
|  | auto *BlockLoop = LI.getLoopFor(Block); | 
|  | if (ParentLoop && | 
|  | (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) { | 
|  | // if the successor is the header of a nested loop pretend its a | 
|  | // single node with the loop's exits as successors | 
|  | SmallVector<BasicBlock *, 4> BlockLoopExits; | 
|  | BlockLoop->getExitBlocks(BlockLoopExits); | 
|  | for (const auto *BlockLoopExit : BlockLoopExits) { | 
|  | visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock); | 
|  | } | 
|  |  | 
|  | } else { | 
|  | // the successors are either on the same loop level or loop exits | 
|  | for (const auto *SuccBlock : successors(Block)) { | 
|  | visitSuccessor(*SuccBlock, ParentLoop, *DefBlock); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // We need to know the definition at the parent loop header to decide | 
|  | // whether the definition at the header is different from the definition at | 
|  | // the loop exits, which would indicate a divergent loop exits. | 
|  | // | 
|  | // A // loop header | 
|  | // | | 
|  | // B // nested loop header | 
|  | // | | 
|  | // C -> X (exit from B loop) -..-> (A latch) | 
|  | // | | 
|  | // D -> back to B (B latch) | 
|  | // | | 
|  | // proper exit from both loops | 
|  | // | 
|  | // D post-dominates B as it is the only proper exit from the "A loop". | 
|  | // If C has a divergent branch, propagation will therefore stop at D. | 
|  | // That implies that B will never receive a definition. | 
|  | // But that definition can only be the same as at D (D itself in thise case) | 
|  | // because all paths to anywhere have to pass through D. | 
|  | // | 
|  | const BasicBlock *ParentLoopHeader = | 
|  | ParentLoop ? ParentLoop->getHeader() : nullptr; | 
|  | if (ParentLoop && ParentLoop->contains(PdBoundBlock)) { | 
|  | DefMap[ParentLoopHeader] = DefMap[PdBoundBlock]; | 
|  | } | 
|  |  | 
|  | // analyze reached loop exits | 
|  | if (!ReachedLoopExits.empty()) { | 
|  | assert(ParentLoop); | 
|  | const auto *HeaderDefBlock = DefMap[ParentLoopHeader]; | 
|  | LLVM_DEBUG(printDefs(dbgs())); | 
|  | assert(HeaderDefBlock && "no definition in header of carrying loop"); | 
|  |  | 
|  | for (const auto *ExitBlock : ReachedLoopExits) { | 
|  | auto ItExitDef = DefMap.find(ExitBlock); | 
|  | assert((ItExitDef != DefMap.end()) && | 
|  | "no reaching def at reachable loop exit"); | 
|  | if (ItExitDef->second != HeaderDefBlock) { | 
|  | JoinBlocks->insert(ExitBlock); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | return std::move(JoinBlocks); | 
|  | } | 
|  | }; | 
|  |  | 
|  | const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) { | 
|  | using LoopExitVec = SmallVector<BasicBlock *, 4>; | 
|  | LoopExitVec LoopExits; | 
|  | Loop.getExitBlocks(LoopExits); | 
|  | if (LoopExits.size() < 1) { | 
|  | return EmptyBlockSet; | 
|  | } | 
|  |  | 
|  | // already available in cache? | 
|  | auto ItCached = CachedLoopExitJoins.find(&Loop); | 
|  | if (ItCached != CachedLoopExitJoins.end()) { | 
|  | return *ItCached->second; | 
|  | } | 
|  |  | 
|  | // dont propagte beyond the immediate post dom of the loop | 
|  | const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Loop.getHeader())); | 
|  | const auto *IpdNode = PdNode->getIDom(); | 
|  | const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; | 
|  | while (PdBoundBlock && Loop.contains(PdBoundBlock)) { | 
|  | IpdNode = IpdNode->getIDom(); | 
|  | PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; | 
|  | } | 
|  |  | 
|  | // compute all join points | 
|  | DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; | 
|  | auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>( | 
|  | *Loop.getHeader(), LoopExits, Loop.getParentLoop(), PdBoundBlock); | 
|  |  | 
|  | auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks)); | 
|  | assert(ItInserted.second); | 
|  | return *ItInserted.first->second; | 
|  | } | 
|  |  | 
|  | const ConstBlockSet & | 
|  | SyncDependenceAnalysis::join_blocks(const Instruction &Term) { | 
|  | // trivial case | 
|  | if (Term.getNumSuccessors() < 1) { | 
|  | return EmptyBlockSet; | 
|  | } | 
|  |  | 
|  | // already available in cache? | 
|  | auto ItCached = CachedBranchJoins.find(&Term); | 
|  | if (ItCached != CachedBranchJoins.end()) | 
|  | return *ItCached->second; | 
|  |  | 
|  | // dont propagate beyond the immediate post dominator of the branch | 
|  | const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(Term.getParent())); | 
|  | const auto *IpdNode = PdNode->getIDom(); | 
|  | const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr; | 
|  |  | 
|  | // compute all join points | 
|  | DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI}; | 
|  | const auto &TermBlock = *Term.getParent(); | 
|  | auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>( | 
|  | TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock), PdBoundBlock); | 
|  |  | 
|  | auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks)); | 
|  | assert(ItInserted.second); | 
|  | return *ItInserted.first->second; | 
|  | } | 
|  |  | 
|  | } // namespace llvm |