Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 1 | //===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==// |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 10 | // This file implements divergence analysis which determines whether a branch |
| 11 | // in a GPU program is divergent.It can help branch optimizations such as jump |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 12 | // threading and loop unswitching to make better decisions. |
| 13 | // |
| 14 | // GPU programs typically use the SIMD execution model, where multiple threads |
| 15 | // in the same execution group have to execute in lock-step. Therefore, if the |
| 16 | // code contains divergent branches (i.e., threads in a group do not agree on |
| 17 | // which path of the branch to take), the group of threads has to execute all |
| 18 | // the paths from that branch with different subsets of threads enabled until |
| 19 | // they converge at the immediately post-dominating BB of the paths. |
| 20 | // |
| 21 | // Due to this execution model, some optimizations such as jump |
| 22 | // threading and loop unswitching can be unfortunately harmful when performed on |
| 23 | // divergent branches. Therefore, an analysis that computes which branches in a |
| 24 | // GPU program are divergent can help the compiler to selectively run these |
| 25 | // optimizations. |
| 26 | // |
| 27 | // This file defines divergence analysis which computes a conservative but |
| 28 | // non-trivial approximation of all divergent branches in a GPU program. It |
| 29 | // partially implements the approach described in |
| 30 | // |
| 31 | // Divergence Analysis |
| 32 | // Sampaio, Souza, Collange, Pereira |
| 33 | // TOPLAS '13 |
| 34 | // |
| 35 | // The divergence analysis identifies the sources of divergence (e.g., special |
| 36 | // variables that hold the thread ID), and recursively marks variables that are |
| 37 | // data or sync dependent on a source of divergence as divergent. |
| 38 | // |
| 39 | // While data dependency is a well-known concept, the notion of sync dependency |
| 40 | // is worth more explanation. Sync dependence characterizes the control flow |
| 41 | // aspect of the propagation of branch divergence. For example, |
| 42 | // |
| 43 | // %cond = icmp slt i32 %tid, 10 |
| 44 | // br i1 %cond, label %then, label %else |
| 45 | // then: |
| 46 | // br label %merge |
| 47 | // else: |
| 48 | // br label %merge |
| 49 | // merge: |
| 50 | // %a = phi i32 [ 0, %then ], [ 1, %else ] |
| 51 | // |
| 52 | // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid |
| 53 | // because %tid is not on its use-def chains, %a is sync dependent on %tid |
| 54 | // because the branch "br i1 %cond" depends on %tid and affects which value %a |
| 55 | // is assigned to. |
| 56 | // |
| 57 | // The current implementation has the following limitations: |
| 58 | // 1. intra-procedural. It conservatively considers the arguments of a |
| 59 | // non-kernel-entry function and the return value of a function call as |
| 60 | // divergent. |
| 61 | // 2. memory as black box. It conservatively considers values loaded from |
| 62 | // generic or local address as divergent. This can be improved by leveraging |
| 63 | // pointer analysis. |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 64 | // |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 65 | //===----------------------------------------------------------------------===// |
| 66 | |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 67 | #include "llvm/Analysis/DivergenceAnalysis.h" |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 68 | #include "llvm/Analysis/Passes.h" |
| 69 | #include "llvm/Analysis/PostDominators.h" |
| 70 | #include "llvm/Analysis/TargetTransformInfo.h" |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 71 | #include "llvm/IR/Dominators.h" |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 72 | #include "llvm/IR/InstIterator.h" |
| 73 | #include "llvm/IR/Instructions.h" |
| 74 | #include "llvm/IR/IntrinsicInst.h" |
| 75 | #include "llvm/IR/Value.h" |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 76 | #include "llvm/Support/CommandLine.h" |
| 77 | #include "llvm/Support/Debug.h" |
| 78 | #include "llvm/Support/raw_ostream.h" |
| 79 | #include "llvm/Transforms/Scalar.h" |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 80 | #include <vector> |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 81 | using namespace llvm; |
| 82 | |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 83 | namespace { |
| 84 | |
| 85 | class DivergencePropagator { |
| 86 | public: |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 87 | DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, |
| 88 | PostDominatorTree &PDT, DenseSet<const Value *> &DV) |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 89 | : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {} |
| 90 | void populateWithSourcesOfDivergence(); |
| 91 | void propagate(); |
| 92 | |
| 93 | private: |
| 94 | // A helper function that explores data dependents of V. |
| 95 | void exploreDataDependency(Value *V); |
| 96 | // A helper function that explores sync dependents of TI. |
| 97 | void exploreSyncDependency(TerminatorInst *TI); |
| 98 | // Computes the influence region from Start to End. This region includes all |
| 99 | // basic blocks on any path from Start to End. |
| 100 | void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End, |
| 101 | DenseSet<BasicBlock *> &InfluenceRegion); |
| 102 | // Finds all users of I that are outside the influence region, and add these |
| 103 | // users to Worklist. |
| 104 | void findUsersOutsideInfluenceRegion( |
| 105 | Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion); |
| 106 | |
| 107 | Function &F; |
| 108 | TargetTransformInfo &TTI; |
| 109 | DominatorTree &DT; |
| 110 | PostDominatorTree &PDT; |
| 111 | std::vector<Value *> Worklist; // Stack for DFS. |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 112 | DenseSet<const Value *> &DV; // Stores all divergent values. |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 113 | }; |
| 114 | |
| 115 | void DivergencePropagator::populateWithSourcesOfDivergence() { |
| 116 | Worklist.clear(); |
| 117 | DV.clear(); |
Nico Rieck | 7819951 | 2015-08-06 19:10:45 +0000 | [diff] [blame] | 118 | for (auto &I : instructions(F)) { |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 119 | if (TTI.isSourceOfDivergence(&I)) { |
| 120 | Worklist.push_back(&I); |
| 121 | DV.insert(&I); |
| 122 | } |
| 123 | } |
| 124 | for (auto &Arg : F.args()) { |
| 125 | if (TTI.isSourceOfDivergence(&Arg)) { |
| 126 | Worklist.push_back(&Arg); |
| 127 | DV.insert(&Arg); |
| 128 | } |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) { |
| 133 | // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's |
| 134 | // immediate post dominator are divergent. This rule handles if-then-else |
| 135 | // patterns. For example, |
| 136 | // |
| 137 | // if (tid < 5) |
| 138 | // a1 = 1; |
| 139 | // else |
| 140 | // a2 = 2; |
| 141 | // a = phi(a1, a2); // sync dependent on (tid < 5) |
| 142 | BasicBlock *ThisBB = TI->getParent(); |
| 143 | BasicBlock *IPostDom = PDT.getNode(ThisBB)->getIDom()->getBlock(); |
| 144 | if (IPostDom == nullptr) |
| 145 | return; |
| 146 | |
| 147 | for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) { |
| 148 | // A PHINode is uniform if it returns the same value no matter which path is |
| 149 | // taken. |
Duncan P. N. Exon Smith | 5a82c91 | 2015-10-10 00:53:03 +0000 | [diff] [blame^] | 150 | if (!cast<PHINode>(I)->hasConstantValue() && DV.insert(&*I).second) |
| 151 | Worklist.push_back(&*I); |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 152 | } |
| 153 | |
| 154 | // Propagation rule 2: if a value defined in a loop is used outside, the user |
| 155 | // is sync dependent on the condition of the loop exits that dominate the |
| 156 | // user. For example, |
| 157 | // |
| 158 | // int i = 0; |
| 159 | // do { |
| 160 | // i++; |
| 161 | // if (foo(i)) ... // uniform |
| 162 | // } while (i < tid); |
| 163 | // if (bar(i)) ... // divergent |
| 164 | // |
| 165 | // A program may contain unstructured loops. Therefore, we cannot leverage |
| 166 | // LoopInfo, which only recognizes natural loops. |
| 167 | // |
| 168 | // The algorithm used here handles both natural and unstructured loops. Given |
| 169 | // a branch TI, we first compute its influence region, the union of all simple |
| 170 | // paths from TI to its immediate post dominator (IPostDom). Then, we search |
| 171 | // for all the values defined in the influence region but used outside. All |
| 172 | // these users are sync dependent on TI. |
| 173 | DenseSet<BasicBlock *> InfluenceRegion; |
| 174 | computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion); |
| 175 | // An insight that can speed up the search process is that all the in-region |
| 176 | // values that are used outside must dominate TI. Therefore, instead of |
| 177 | // searching every basic blocks in the influence region, we search all the |
| 178 | // dominators of TI until it is outside the influence region. |
| 179 | BasicBlock *InfluencedBB = ThisBB; |
| 180 | while (InfluenceRegion.count(InfluencedBB)) { |
| 181 | for (auto &I : *InfluencedBB) |
| 182 | findUsersOutsideInfluenceRegion(I, InfluenceRegion); |
| 183 | DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom(); |
| 184 | if (IDomNode == nullptr) |
| 185 | break; |
| 186 | InfluencedBB = IDomNode->getBlock(); |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | void DivergencePropagator::findUsersOutsideInfluenceRegion( |
| 191 | Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) { |
| 192 | for (User *U : I.users()) { |
| 193 | Instruction *UserInst = cast<Instruction>(U); |
| 194 | if (!InfluenceRegion.count(UserInst->getParent())) { |
| 195 | if (DV.insert(UserInst).second) |
| 196 | Worklist.push_back(UserInst); |
| 197 | } |
| 198 | } |
| 199 | } |
| 200 | |
| 201 | void DivergencePropagator::computeInfluenceRegion( |
| 202 | BasicBlock *Start, BasicBlock *End, |
| 203 | DenseSet<BasicBlock *> &InfluenceRegion) { |
| 204 | assert(PDT.properlyDominates(End, Start) && |
| 205 | "End does not properly dominate Start"); |
| 206 | std::vector<BasicBlock *> InfluenceStack; |
| 207 | InfluenceStack.push_back(Start); |
| 208 | InfluenceRegion.insert(Start); |
| 209 | while (!InfluenceStack.empty()) { |
| 210 | BasicBlock *BB = InfluenceStack.back(); |
| 211 | InfluenceStack.pop_back(); |
| 212 | for (BasicBlock *Succ : successors(BB)) { |
| 213 | if (End != Succ && InfluenceRegion.insert(Succ).second) |
| 214 | InfluenceStack.push_back(Succ); |
| 215 | } |
| 216 | } |
| 217 | } |
| 218 | |
| 219 | void DivergencePropagator::exploreDataDependency(Value *V) { |
| 220 | // Follow def-use chains of V. |
| 221 | for (User *U : V->users()) { |
| 222 | Instruction *UserInst = cast<Instruction>(U); |
| 223 | if (DV.insert(UserInst).second) |
| 224 | Worklist.push_back(UserInst); |
| 225 | } |
| 226 | } |
| 227 | |
| 228 | void DivergencePropagator::propagate() { |
| 229 | // Traverse the dependency graph using DFS. |
| 230 | while (!Worklist.empty()) { |
| 231 | Value *V = Worklist.back(); |
| 232 | Worklist.pop_back(); |
| 233 | if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) { |
| 234 | // Terminators with less than two successors won't introduce sync |
| 235 | // dependency. Ignore them. |
| 236 | if (TI->getNumSuccessors() > 1) |
| 237 | exploreSyncDependency(TI); |
| 238 | } |
| 239 | exploreDataDependency(V); |
| 240 | } |
| 241 | } |
| 242 | |
Alexander Kornienko | f00654e | 2015-06-23 09:49:53 +0000 | [diff] [blame] | 243 | } /// end namespace anonymous |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 244 | |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 245 | // Register this pass. |
| 246 | char DivergenceAnalysis::ID = 0; |
| 247 | INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis", |
| 248 | false, true) |
| 249 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
| 250 | INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) |
| 251 | INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis", |
| 252 | false, true) |
| 253 | |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 254 | FunctionPass *llvm::createDivergenceAnalysisPass() { |
| 255 | return new DivergenceAnalysis(); |
| 256 | } |
| 257 | |
Marcello Maggioni | ab58c74 | 2015-09-21 17:58:14 +0000 | [diff] [blame] | 258 | void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { |
| 259 | AU.addRequired<DominatorTreeWrapperPass>(); |
| 260 | AU.addRequired<PostDominatorTree>(); |
| 261 | AU.setPreservesAll(); |
| 262 | } |
| 263 | |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 264 | bool DivergenceAnalysis::runOnFunction(Function &F) { |
| 265 | auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); |
| 266 | if (TTIWP == nullptr) |
| 267 | return false; |
| 268 | |
| 269 | TargetTransformInfo &TTI = TTIWP->getTTI(F); |
| 270 | // Fast path: if the target does not have branch divergence, we do not mark |
| 271 | // any branch as divergent. |
| 272 | if (!TTI.hasBranchDivergence()) |
| 273 | return false; |
| 274 | |
| 275 | DivergentValues.clear(); |
| 276 | DivergencePropagator DP(F, TTI, |
| 277 | getAnalysis<DominatorTreeWrapperPass>().getDomTree(), |
| 278 | getAnalysis<PostDominatorTree>(), DivergentValues); |
| 279 | DP.populateWithSourcesOfDivergence(); |
| 280 | DP.propagate(); |
| 281 | return false; |
| 282 | } |
| 283 | |
| 284 | void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const { |
| 285 | if (DivergentValues.empty()) |
| 286 | return; |
| 287 | const Value *FirstDivergentValue = *DivergentValues.begin(); |
| 288 | const Function *F; |
| 289 | if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) { |
| 290 | F = Arg->getParent(); |
| 291 | } else if (const Instruction *I = |
| 292 | dyn_cast<Instruction>(FirstDivergentValue)) { |
| 293 | F = I->getParent()->getParent(); |
| 294 | } else { |
| 295 | llvm_unreachable("Only arguments and instructions can be divergent"); |
| 296 | } |
| 297 | |
| 298 | // Dumps all divergent values in F, arguments and then instructions. |
| 299 | for (auto &Arg : F->args()) { |
| 300 | if (DivergentValues.count(&Arg)) |
| 301 | OS << "DIVERGENT: " << Arg << "\n"; |
| 302 | } |
Nico Rieck | 7819951 | 2015-08-06 19:10:45 +0000 | [diff] [blame] | 303 | // Iterate instructions using instructions() to ensure a deterministic order. |
| 304 | for (auto &I : instructions(F)) { |
Jingyue Wu | 5da831c | 2015-04-10 05:03:50 +0000 | [diff] [blame] | 305 | if (DivergentValues.count(&I)) |
| 306 | OS << "DIVERGENT:" << I << "\n"; |
| 307 | } |
| 308 | } |