//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a general divergence analysis for loop vectorization
// and GPU programs. It determines which branches and values in a loop or GPU
// program are divergent. It can help branch optimizations such as jump
// threading and loop unswitching to make better decisions.
//
// GPU programs typically use the SIMD execution model, where multiple threads
// in the same execution group have to execute in lock-step. Therefore, if the
// code contains divergent branches (i.e., threads in a group do not agree on
// which path of the branch to take), the group of threads has to execute all
// the paths from that branch with different subsets of threads enabled until
// they re-converge.
//
// Due to this execution model, some optimizations such as jump
// threading and loop unswitching can interfere with thread re-convergence.
// Therefore, an analysis that computes which branches in a GPU program are
// divergent can help the compiler to selectively run these optimizations.
//
// This implementation is derived from the Vectorization Analysis of the
// Region Vectorizer (RV). That implementation in turn is based on the approach
// described in
//
//   Improving Performance of OpenCL on CPUs
//   Ralf Karrenberg and Sebastian Hack
//   CC '12
//
// This DivergenceAnalysis implementation is generic in the sense that it does
// not itself identify original sources of divergence. Instead, specialized
// adapter classes, LoopDivergenceAnalysis for loops and GPUDivergenceAnalysis
// for GPU programs, identify the sources of divergence (e.g., special
// variables that hold the thread ID or the loop iteration variable).
//
// The generic implementation propagates divergence to variables that are data
// or sync dependent on a source of divergence.
//
// While data dependency is a well-known concept, the notion of sync dependency
// is worth more explanation. Sync dependence characterizes the control flow
// aspect of the propagation of branch divergence. For example,
//
//   %cond = icmp slt i32 %tid, 10
//   br i1 %cond, label %then, label %else
// then:
//   br label %merge
// else:
//   br label %merge
// merge:
//   %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
// The sync dependence detection (which branch induces divergence in which join
// points) is implemented in the SyncDependenceAnalysis.
//
// The current DivergenceAnalysis implementation has the following limitations:
// 1. intra-procedural. It conservatively considers the arguments of a
//    non-kernel-entry function and the return value of a function call as
//    divergent.
// 2. memory as a black box. It conservatively considers values loaded from a
//    generic or local address as divergent. This can be improved by leveraging
//    pointer analysis and/or by modelling non-escaping memory objects in SSA
//    as done in RV. See the illustrative example below.
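//
//    As a minimal illustration (the IR below is hypothetical and not tied to
//    any particular target or address space), a value loaded through a
//    pointer of unknown provenance is treated as divergent:
//
//      %v = load i32, i32* %p   ; %v is conservatively marked divergent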
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "divergence-analysis"

// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
      IsLCSSAForm(IsLCSSAForm) {}

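// marks DivVal as divergent; only instructions and arguments can be marked,
// and values with a uniform override must never become divergent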
void DivergenceAnalysis::markDivergent(const Value &DivVal) {
  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
  assert(!isAlwaysUniform(DivVal) && "cannot be divergent");
  DivergentValues.insert(&DivVal);
}

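// exempts UniVal from divergence propagation; it is treated as always uniform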
void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
  UniformOverrides.insert(&UniVal);
}

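// returns whether the terminator Term is divergent, i.e. whether the
// condition of its conditional branch or switch is divergent; terminators
// with at most one successor and invokes never count as divergent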
bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
  if (Term.getNumSuccessors() <= 1)
    return false;
  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
    assert(BranchTerm->isConditional());
    return isDivergent(*BranchTerm->getCondition());
  }
  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
    return isDivergent(*SwitchTerm->getCondition());
  }
  if (isa<InvokeInst>(Term)) {
    return false; // ignore abnormal executions through landingpad
  }

  llvm_unreachable("unexpected terminator");
}

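// returns whether the non-phi instruction I becomes divergent because at
// least one of its operands is divergent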
bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
  // TODO function calls with side effects, etc
  for (const auto &Op : I.operands()) {
    if (isDivergent(*Op))
      return true;
  }
  return false;
}

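// returns whether Val is temporally divergent when observed from
// ObservingBlock, i.e. whether a divergent loop that carries the definition
// of Val terminates before control reaches ObservingBlock, so threads may
// observe values of Val from different loop iterations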
bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
                                             const Value &Val) const {
  const auto *Inst = dyn_cast<const Instruction>(&Val);
  if (!Inst)
    return false;
  // check whether any divergent loop carrying Val terminates before control
  // proceeds to ObservingBlock
  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
       Loop = Loop->getParentLoop()) {
    if (DivergentLoops.find(Loop) != DivergentLoops.end())
      return true;
  }

  return false;
}

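// returns whether the phi node Phi becomes divergent, either because
// divergent disjoint paths join in its parent block or because one of its
// incoming values is divergent or temporally divergent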
bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
  // divergent disjoint paths join in Phi's parent block
  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
    return true;
  }

  // An incoming value could be divergent by itself.
  // Otherwise, an incoming value could be uniform within the loop
  // that carries its definition but it may appear divergent
  // from outside the loop. This happens when divergent loop exits
  // drop definitions of that uniform value in different iterations.
  //
  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
  //   if (i % thread_id == 0) break;    // divergent loop exit
  // }
  // int divI = i;                 // divI is divergent
  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
    const auto *InVal = Phi.getIncomingValue(i);
    if (isDivergent(*InVal) ||
        isTemporalDivergent(*Phi.getParent(), *InVal)) {
      return true;
    }
  }
  return false;
}

bool DivergenceAnalysis::inRegion(const Instruction &I) const {
  return I.getParent() && inRegion(*I.getParent());
}

bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
  // In whole-function mode (no region loop) every block of F is in the
  // region; otherwise the region is exactly the blocks of RegionLoop.
  return RegionLoop ? RegionLoop->contains(&BB) : (BB.getParent() == &F);
}

// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
  auto *DivLoop = LI.getLoopFor(&LoopHeader);
  assert(DivLoop && "loopHeader is not actually part of a loop");

  SmallVector<BasicBlock *, 8> TaintStack;
  DivLoop->getExitBlocks(TaintStack);

  // Without LCSSA form, potential users of loop-carried values could be
  // anywhere in the dominance region of DivLoop (including its fringes for
  // phi nodes).
  DenseSet<const BasicBlock *> Visited;
  for (auto *Block : TaintStack) {
    Visited.insert(Block);
  }
  Visited.insert(&LoopHeader);

  while (!TaintStack.empty()) {
    auto *UserBlock = TaintStack.back();
    TaintStack.pop_back();

    // don't spread divergence beyond the region
    if (!inRegion(*UserBlock))
      continue;

    assert(!DivLoop->contains(UserBlock) &&
           "irreducible control flow detected");

    // phi nodes at the fringes of the dominance region
    if (!DT.dominates(&LoopHeader, UserBlock)) {
      // all PHI nodes of UserBlock become divergent
      for (auto &Phi : UserBlock->phis()) {
        Worklist.push_back(&Phi);
      }
      continue;
    }

    // taint outside users of values carried by DivLoop
    for (auto &I : *UserBlock) {
      if (isAlwaysUniform(I))
        continue;
      if (isDivergent(I))
        continue;

      for (auto &Op : I.operands()) {
        auto *OpInst = dyn_cast<Instruction>(&Op);
        if (!OpInst)
          continue;
        if (DivLoop->contains(OpInst->getParent())) {
          markDivergent(I);
          pushUsers(I);
          break;
        }
      }
    }

    // visit all blocks in the dominance region
    for (auto *SuccBlock : successors(UserBlock)) {
      if (!Visited.insert(SuccBlock).second) {
        continue;
      }
      TaintStack.push_back(SuccBlock);
    }
  }
}

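// pushes all phi nodes of Block that are not yet known to be divergent onto
// the worklist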
void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
  for (const auto &Phi : Block.phis()) {
    if (isDivergent(Phi))
      continue;
    Worklist.push_back(&Phi);
  }
}

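// pushes all in-region instruction users of V that are not yet known to be
// divergent onto the worklist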
void DivergenceAnalysis::pushUsers(const Value &V) {
  for (const auto *User : V.users()) {
    const auto *UserInst = dyn_cast<const Instruction>(User);
    if (!UserInst)
      continue;

    if (isDivergent(*UserInst))
      continue;

    // only compute divergence inside the region
    if (!inRegion(*UserInst))
      continue;
    Worklist.push_back(UserInst);
  }
}

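// propagates the fact that divergent disjoint paths join in JoinBlock by
// pushing its phi nodes to the worklist; returns true if JoinBlock lies
// outside of BranchLoop, i.e. JoinBlock is a divergent loop exit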
bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
                                                 const Loop *BranchLoop) {
  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");

  // ignore divergence outside the region
  if (!inRegion(JoinBlock)) {
    return false;
  }

  // push non-divergent phi nodes in JoinBlock to the worklist
  pushPHINodes(JoinBlock);

  // JoinBlock is a divergent loop exit
  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
    return true;
  }

  // disjoint-paths divergent at JoinBlock
  markBlockJoinDivergent(JoinBlock);
  return false;
}

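// propagates the divergence induced by the divergent terminator Term: taints
// all join points of disjoint paths starting at Term and, if a loop exit
// becomes divergent, propagates divergence to the enclosing loop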
void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");

  markDivergent(Term);

  const auto *BranchLoop = LI.getLoopFor(Term.getParent());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from Term within the
  // loop; this also covers loop exits that become divergent due to Term.
  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // Branch loop is a divergent loop due to the divergent branch in Term
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}

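// propagates the divergence caused by the divergent loop ExitingLoop: taints
// all join points of its divergent exits and transitively marks parent loops
// divergent when one of their exits becomes divergent in turn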
void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");

  // don't propagate beyond region
  if (!inRegion(*ExitingLoop.getHeader()))
    return;

  const auto *BranchLoop = ExitingLoop.getParentLoop();

  // Uses of loop-carried values could occur anywhere
  // within the dominance region of the definition. All loop-carried
  // definitions are dominated by the loop header (reducible control).
  // Thus all users have to be in the dominance region of the loop header,
  // except PHI nodes that can also live at the fringes of the dom region
  // (incoming defining value).
  if (!IsLCSSAForm)
    taintLoopLiveOuts(*ExitingLoop.getHeader());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from exits of
  // ExitingLoop; this also covers loop exits (of BranchLoop) that in turn
  // become divergent.
  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // Branch loop is divergent due to a divergent loop exit in ExitingLoop
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}

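// worklist-based fixed-point propagation: starting from the seeded sources of
// divergence, spreads divergence through data and sync dependencies until no
// more values change; divergence markings only ever grow, so this terminates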
void DivergenceAnalysis::compute() {
  for (auto *DivVal : DivergentValues) {
    pushUsers(*DivVal);
  }

  // propagate divergence
  while (!Worklist.empty()) {
    const Instruction &I = *Worklist.back();
    Worklist.pop_back();

    // maintain uniformity of overrides
    if (isAlwaysUniform(I))
      continue;

    bool WasDivergent = isDivergent(I);
    if (WasDivergent)
      continue;

    // propagate divergence caused by terminator
    if (I.isTerminator()) {
      if (updateTerminator(I)) {
        // propagate control divergence to affected instructions
        propagateBranchDivergence(I);
        continue;
      }
    }

    // update divergence of I due to divergent operands
    bool DivergentUpd = false;
    const auto *Phi = dyn_cast<const PHINode>(&I);
    if (Phi) {
      DivergentUpd = updatePHINode(*Phi);
    } else {
      DivergentUpd = updateNormalInstruction(I);
    }

    // propagate value divergence to users
    if (DivergentUpd) {
      markDivergent(I);
      pushUsers(I);
    }
  }
}

bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
  return UniformOverrides.find(&V) != UniformOverrides.end();
}

bool DivergenceAnalysis::isDivergent(const Value &V) const {
  return DivergentValues.find(&V) != DivergentValues.end();
}

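// returns whether the use U is divergent: either the used value is divergent
// itself or it is temporally divergent at the user's block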
bool DivergenceAnalysis::isDivergentUse(const Use &U) const {
  Value &V = *U.get();
  Instruction &I = *cast<Instruction>(U.getUser());
  return isDivergent(V) || isTemporalDivergent(*I.getParent(), V);
}

void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
  if (DivergentValues.empty())
    return;
  // iterate instructions using instructions() to ensure a deterministic order.
  for (auto &I : instructions(F)) {
    if (isDivergent(I))
      OS << "DIVERGENT:" << I << '\n';
  }
}

// class GPUDivergenceAnalysis
GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
                                             const DominatorTree &DT,
                                             const PostDominatorTree &PDT,
                                             const LoopInfo &LI,
                                             const TargetTransformInfo &TTI)
    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
  for (auto &I : instructions(F)) {
    if (TTI.isSourceOfDivergence(&I)) {
      DA.markDivergent(I);
    } else if (TTI.isAlwaysUniform(&I)) {
      DA.addUniformOverride(I);
    }
  }
  for (auto &Arg : F.args()) {
    if (TTI.isSourceOfDivergence(&Arg)) {
      DA.markDivergent(Arg);
    }
  }

  DA.compute();
}

bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
  return DA.isDivergent(val);
}

bool GPUDivergenceAnalysis::isDivergentUse(const Use &use) const {
  return DA.isDivergentUse(use);
}

void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
  DA.print(OS, mod);
  OS << "}\n";
}