Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1 | //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file implements the loop fusion pass. |
| 11 | /// The implementation is largely based on the following document: |
| 12 | /// |
| 13 | /// Code Transformations to Augment the Scope of Loop Fusion in a |
| 14 | /// Production Compiler |
| 15 | /// Christopher Mark Barton |
| 16 | /// MSc Thesis |
| 17 | /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf |
| 18 | /// |
| 19 | /// The general approach taken is to collect sets of control flow equivalent |
| 20 | /// loops and test whether they can be fused. The necessary conditions for |
| 21 | /// fusion are: |
| 22 | /// 1. The loops must be adjacent (there cannot be any statements between |
| 23 | /// the two loops). |
| 24 | /// 2. The loops must be conforming (they must execute the same number of |
| 25 | /// iterations). |
| 26 | /// 3. The loops must be control flow equivalent (if one loop executes, the |
| 27 | /// other is guaranteed to execute). |
| 28 | /// 4. There cannot be any negative distance dependencies between the loops. |
| 29 | /// If all of these conditions are satisfied, it is safe to fuse the loops. |
| 30 | /// |
| 31 | /// This implementation creates FusionCandidates that represent the loop and the |
| 32 | /// necessary information needed by fusion. It then operates on the fusion |
| 33 | /// candidates, first confirming that the candidate is eligible for fusion. The |
| 34 | /// candidates are then collected into control flow equivalent sets, sorted in |
| 35 | /// dominance order. Each set of control flow equivalent candidates is then |
| 36 | /// traversed, attempting to fuse pairs of candidates in the set. If all |
| 37 | /// requirements for fusion are met, the two candidates are fused, creating a |
| 38 | /// new (fused) candidate which is then added back into the set to consider for |
| 39 | /// additional fusion. |
| 40 | /// |
| 41 | /// This implementation currently does not make any modifications to remove |
| 42 | /// conditions for fusion. Code transformations to make loops conform to each of |
| 43 | /// the conditions for fusion are discussed in more detail in the document |
| 44 | /// above. These can be added to the current implementation in the future. |
| 45 | //===----------------------------------------------------------------------===// |
| 46 | |
| 47 | #include "llvm/Transforms/Scalar/LoopFuse.h" |
| 48 | #include "llvm/ADT/Statistic.h" |
| 49 | #include "llvm/Analysis/DependenceAnalysis.h" |
| 50 | #include "llvm/Analysis/DomTreeUpdater.h" |
| 51 | #include "llvm/Analysis/LoopInfo.h" |
| 52 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 53 | #include "llvm/Analysis/PostDominators.h" |
| 54 | #include "llvm/Analysis/ScalarEvolution.h" |
| 55 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 56 | #include "llvm/IR/Function.h" |
| 57 | #include "llvm/IR/Verifier.h" |
| 58 | #include "llvm/Pass.h" |
| 59 | #include "llvm/Support/Debug.h" |
| 60 | #include "llvm/Support/raw_ostream.h" |
| 61 | #include "llvm/Transforms/Scalar.h" |
| 62 | #include "llvm/Transforms/Utils.h" |
| 63 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 64 | |
| 65 | using namespace llvm; |
| 66 | |
| 67 | #define DEBUG_TYPE "loop-fusion" |
| 68 | |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 69 | STATISTIC(FuseCounter, "Loops fused"); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 70 | STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); |
| 71 | STATISTIC(InvalidPreheader, "Loop has invalid preheader"); |
| 72 | STATISTIC(InvalidHeader, "Loop has invalid header"); |
| 73 | STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); |
| 74 | STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); |
| 75 | STATISTIC(InvalidLatch, "Loop has invalid latch"); |
| 76 | STATISTIC(InvalidLoop, "Loop is invalid"); |
| 77 | STATISTIC(AddressTakenBB, "Basic block has address taken"); |
| 78 | STATISTIC(MayThrowException, "Loop may throw an exception"); |
| 79 | STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); |
| 80 | STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); |
| 81 | STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 82 | STATISTIC(UnknownTripCount, "Loop has unknown trip count"); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 83 | STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 84 | STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); |
| 85 | STATISTIC(NonAdjacent, "Loops are not adjacent"); |
| 86 | STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); |
| 87 | STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 88 | STATISTIC(NonIdenticalGuards, "Candidates have different guards"); |
| 89 | STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); |
| 90 | STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 91 | |
| 92 | enum FusionDependenceAnalysisChoice { |
| 93 | FUSION_DEPENDENCE_ANALYSIS_SCEV, |
| 94 | FUSION_DEPENDENCE_ANALYSIS_DA, |
| 95 | FUSION_DEPENDENCE_ANALYSIS_ALL, |
| 96 | }; |
| 97 | |
| 98 | static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( |
| 99 | "loop-fusion-dependence-analysis", |
| 100 | cl::desc("Which dependence analysis should loop fusion use?"), |
| 101 | cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", |
| 102 | "Use the scalar evolution interface"), |
| 103 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", |
| 104 | "Use the dependence analysis interface"), |
| 105 | clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", |
| 106 | "Use all available analyses")), |
| 107 | cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); |
| 108 | |
| 109 | #ifndef NDEBUG |
| 110 | static cl::opt<bool> |
| 111 | VerboseFusionDebugging("loop-fusion-verbose-debug", |
| 112 | cl::desc("Enable verbose debugging for Loop Fusion"), |
| 113 | cl::Hidden, cl::init(false), cl::ZeroOrMore); |
| 114 | #endif |
| 115 | |
Benjamin Kramer | dc5f805 | 2019-08-23 19:59:23 +0000 | [diff] [blame] | 116 | namespace { |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 117 | /// This class is used to represent a candidate for loop fusion. When it is |
| 118 | /// constructed, it checks the conditions for loop fusion to ensure that it |
| 119 | /// represents a valid candidate. It caches several parts of a loop that are |
| 120 | /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead |
| 121 | /// of continually querying the underlying Loop to retrieve these values. It is |
| 122 | /// assumed these will not change throughout loop fusion. |
| 123 | /// |
| 124 | /// The invalidate method should be used to indicate that the FusionCandidate is |
| 125 | /// no longer a valid candidate for fusion. Similarly, the isValid() method can |
| 126 | /// be used to ensure that the FusionCandidate is still valid for fusion. |
| 127 | struct FusionCandidate { |
| 128 | /// Cache of parts of the loop used throughout loop fusion. These should not |
| 129 | /// need to change throughout the analysis and transformation. |
| 130 | /// These parts are cached to avoid repeatedly looking up in the Loop class. |
| 131 | |
| 132 | /// Preheader of the loop this candidate represents |
| 133 | BasicBlock *Preheader; |
| 134 | /// Header of the loop this candidate represents |
| 135 | BasicBlock *Header; |
| 136 | /// Blocks in the loop that exit the loop |
| 137 | BasicBlock *ExitingBlock; |
| 138 | /// The successor block of this loop (where the exiting blocks go to) |
| 139 | BasicBlock *ExitBlock; |
| 140 | /// Latch of the loop |
| 141 | BasicBlock *Latch; |
| 142 | /// The loop that this fusion candidate represents |
| 143 | Loop *L; |
| 144 | /// Vector of instructions in this loop that read from memory |
| 145 | SmallVector<Instruction *, 16> MemReads; |
| 146 | /// Vector of instructions in this loop that write to memory |
| 147 | SmallVector<Instruction *, 16> MemWrites; |
| 148 | /// Are all of the members of this fusion candidate still valid |
| 149 | bool Valid; |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 150 | /// Guard branch of the loop, if it exists |
| 151 | BranchInst *GuardBranch; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 152 | |
| 153 | /// Dominator and PostDominator trees are needed for the |
| 154 | /// FusionCandidateCompare function, required by FusionCandidateSet to |
| 155 | /// determine where the FusionCandidate should be inserted into the set. These |
| 156 | /// are used to establish ordering of the FusionCandidates based on dominance. |
| 157 | const DominatorTree *DT; |
| 158 | const PostDominatorTree *PDT; |
| 159 | |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 160 | OptimizationRemarkEmitter &ORE; |
| 161 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 162 | FusionCandidate(Loop *L, const DominatorTree *DT, |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 163 | const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 164 | : Preheader(L->getLoopPreheader()), Header(L->getHeader()), |
| 165 | ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 166 | Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr), |
| 167 | DT(DT), PDT(PDT), ORE(ORE) { |
| 168 | |
| 169 | // TODO: This is temporary while we fuse both rotated and non-rotated |
| 170 | // loops. Once we switch to only fusing rotated loops, the initialization of |
| 171 | // GuardBranch can be moved into the initialization list above. |
| 172 | if (isRotated()) |
| 173 | GuardBranch = L->getLoopGuardBranch(); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 174 | |
| 175 | // Walk over all blocks in the loop and check for conditions that may |
| 176 | // prevent fusion. For each block, walk over all instructions and collect |
| 177 | // the memory reads and writes If any instructions that prevent fusion are |
| 178 | // found, invalidate this object and return. |
| 179 | for (BasicBlock *BB : L->blocks()) { |
| 180 | if (BB->hasAddressTaken()) { |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 181 | invalidate(); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 182 | reportInvalidCandidate(AddressTakenBB); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 183 | return; |
| 184 | } |
| 185 | |
| 186 | for (Instruction &I : *BB) { |
| 187 | if (I.mayThrow()) { |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 188 | invalidate(); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 189 | reportInvalidCandidate(MayThrowException); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 190 | return; |
| 191 | } |
| 192 | if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { |
| 193 | if (SI->isVolatile()) { |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 194 | invalidate(); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 195 | reportInvalidCandidate(ContainsVolatileAccess); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 196 | return; |
| 197 | } |
| 198 | } |
| 199 | if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { |
| 200 | if (LI->isVolatile()) { |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 201 | invalidate(); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 202 | reportInvalidCandidate(ContainsVolatileAccess); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 203 | return; |
| 204 | } |
| 205 | } |
| 206 | if (I.mayWriteToMemory()) |
| 207 | MemWrites.push_back(&I); |
| 208 | if (I.mayReadFromMemory()) |
| 209 | MemReads.push_back(&I); |
| 210 | } |
| 211 | } |
| 212 | } |
| 213 | |
| 214 | /// Check if all members of the class are valid. |
| 215 | bool isValid() const { |
| 216 | return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && |
| 217 | !L->isInvalid() && Valid; |
| 218 | } |
| 219 | |
| 220 | /// Verify that all members are in sync with the Loop object. |
| 221 | void verify() const { |
| 222 | assert(isValid() && "Candidate is not valid!!"); |
| 223 | assert(!L->isInvalid() && "Loop is invalid!"); |
| 224 | assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); |
| 225 | assert(Header == L->getHeader() && "Header is out of sync"); |
| 226 | assert(ExitingBlock == L->getExitingBlock() && |
| 227 | "Exiting Blocks is out of sync"); |
| 228 | assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); |
| 229 | assert(Latch == L->getLoopLatch() && "Latch is out of sync"); |
| 230 | } |
| 231 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 232 | /// Get the entry block for this fusion candidate. |
| 233 | /// |
| 234 | /// If this fusion candidate represents a guarded loop, the entry block is the |
| 235 | /// loop guard block. If it represents an unguarded loop, the entry block is |
| 236 | /// the preheader of the loop. |
| 237 | BasicBlock *getEntryBlock() const { |
| 238 | if (GuardBranch) |
| 239 | return GuardBranch->getParent(); |
| 240 | else |
| 241 | return Preheader; |
| 242 | } |
| 243 | |
| 244 | /// Given a guarded loop, get the successor of the guard that is not in the |
| 245 | /// loop. |
| 246 | /// |
| 247 | /// This method returns the successor of the loop guard that is not located |
| 248 | /// within the loop (i.e., the successor of the guard that is not the |
| 249 | /// preheader). |
| 250 | /// This method is only valid for guarded loops. |
| 251 | BasicBlock *getNonLoopBlock() const { |
| 252 | assert(GuardBranch && "Only valid on guarded loops."); |
| 253 | assert(GuardBranch->isConditional() && |
| 254 | "Expecting guard to be a conditional branch."); |
| 255 | return (GuardBranch->getSuccessor(0) == Preheader) |
| 256 | ? GuardBranch->getSuccessor(1) |
| 257 | : GuardBranch->getSuccessor(0); |
| 258 | } |
| 259 | |
| 260 | bool isRotated() const { |
| 261 | assert(L && "Expecting loop to be valid."); |
| 262 | assert(Latch && "Expecting latch to be valid."); |
| 263 | return L->isLoopExiting(Latch); |
| 264 | } |
| 265 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 266 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 267 | LLVM_DUMP_METHOD void dump() const { |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 268 | dbgs() << "\tGuardBranch: " |
| 269 | << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" |
| 270 | << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 271 | << "\n" |
| 272 | << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" |
| 273 | << "\tExitingBB: " |
| 274 | << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" |
| 275 | << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") |
| 276 | << "\n" |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 277 | << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" |
| 278 | << "\tEntryBlock: " |
| 279 | << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") |
| 280 | << "\n"; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 281 | } |
| 282 | #endif |
| 283 | |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 284 | /// Determine if a fusion candidate (representing a loop) is eligible for |
| 285 | /// fusion. Note that this only checks whether a single loop can be fused - it |
| 286 | /// does not check whether it is *legal* to fuse two loops together. |
| 287 | bool isEligibleForFusion(ScalarEvolution &SE) const { |
| 288 | if (!isValid()) { |
| 289 | LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); |
| 290 | if (!Preheader) |
| 291 | ++InvalidPreheader; |
| 292 | if (!Header) |
| 293 | ++InvalidHeader; |
| 294 | if (!ExitingBlock) |
| 295 | ++InvalidExitingBlock; |
| 296 | if (!ExitBlock) |
| 297 | ++InvalidExitBlock; |
| 298 | if (!Latch) |
| 299 | ++InvalidLatch; |
| 300 | if (L->isInvalid()) |
| 301 | ++InvalidLoop; |
| 302 | |
| 303 | return false; |
| 304 | } |
| 305 | |
| 306 | // Require ScalarEvolution to be able to determine a trip count. |
| 307 | if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { |
| 308 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
| 309 | << " trip count not computable!\n"); |
| 310 | return reportInvalidCandidate(UnknownTripCount); |
| 311 | } |
| 312 | |
| 313 | if (!L->isLoopSimplifyForm()) { |
| 314 | LLVM_DEBUG(dbgs() << "Loop " << L->getName() |
| 315 | << " is not in simplified form!\n"); |
| 316 | return reportInvalidCandidate(NotSimplifiedForm); |
| 317 | } |
| 318 | |
| 319 | return true; |
| 320 | } |
| 321 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 322 | private: |
| 323 | // This is only used internally for now, to clear the MemWrites and MemReads |
| 324 | // list and setting Valid to false. I can't envision other uses of this right |
| 325 | // now, since once FusionCandidates are put into the FusionCandidateSet they |
| 326 | // are immutable. Thus, any time we need to change/update a FusionCandidate, |
| 327 | // we must create a new one and insert it into the FusionCandidateSet to |
| 328 | // ensure the FusionCandidateSet remains ordered correctly. |
| 329 | void invalidate() { |
| 330 | MemWrites.clear(); |
| 331 | MemReads.clear(); |
| 332 | Valid = false; |
| 333 | } |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 334 | |
| 335 | bool reportInvalidCandidate(llvm::Statistic &Stat) const { |
| 336 | using namespace ore; |
| 337 | assert(L && Preheader && "Fusion candidate not initialized properly!"); |
| 338 | ++Stat; |
| 339 | ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), |
| 340 | L->getStartLoc(), Preheader) |
| 341 | << "[" << Preheader->getParent()->getName() << "]: " |
| 342 | << "Loop is not a candidate for fusion: " << Stat.getDesc()); |
| 343 | return false; |
| 344 | } |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 345 | }; |
| 346 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 347 | struct FusionCandidateCompare { |
| 348 | /// Comparison functor to sort two Control Flow Equivalent fusion candidates |
| 349 | /// into dominance order. |
| 350 | /// If LHS dominates RHS and RHS post-dominates LHS, return true; |
| 351 | /// IF RHS dominates LHS and LHS post-dominates RHS, return false; |
| 352 | bool operator()(const FusionCandidate &LHS, |
| 353 | const FusionCandidate &RHS) const { |
| 354 | const DominatorTree *DT = LHS.DT; |
| 355 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 356 | BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); |
| 357 | BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); |
| 358 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 359 | // Do not save PDT to local variable as it is only used in asserts and thus |
| 360 | // will trigger an unused variable warning if building without asserts. |
| 361 | assert(DT && LHS.PDT && "Expecting valid dominator tree"); |
| 362 | |
Richard Trieu | 7b61920 | 2019-04-18 01:39:45 +0000 | [diff] [blame] | 363 | // Do this compare first so if LHS == RHS, function returns false. |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 364 | if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 365 | // RHS dominates LHS |
| 366 | // Verify LHS post-dominates RHS |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 367 | assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 368 | return false; |
| 369 | } |
Richard Trieu | 7b61920 | 2019-04-18 01:39:45 +0000 | [diff] [blame] | 370 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 371 | if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { |
Richard Trieu | 7b61920 | 2019-04-18 01:39:45 +0000 | [diff] [blame] | 372 | // Verify RHS Postdominates LHS |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 373 | assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); |
Richard Trieu | 7b61920 | 2019-04-18 01:39:45 +0000 | [diff] [blame] | 374 | return true; |
| 375 | } |
| 376 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 377 | // If LHS does not dominate RHS and RHS does not dominate LHS then there is |
| 378 | // no dominance relationship between the two FusionCandidates. Thus, they |
| 379 | // should not be in the same set together. |
| 380 | llvm_unreachable( |
| 381 | "No dominance relationship between these fusion candidates!"); |
| 382 | } |
| 383 | }; |
| 384 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 385 | using LoopVector = SmallVector<Loop *, 4>; |
| 386 | |
| 387 | // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance |
| 388 | // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 |
| 389 | // dominates FC1 and FC1 post-dominates FC0. |
| 390 | // std::set was chosen because we want a sorted data structure with stable |
| 391 | // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent |
| 392 | // loops by moving intervening code around. When this intervening code contains |
| 393 | // loops, those loops will be moved also. The corresponding FusionCandidates |
| 394 | // will also need to be moved accordingly. As this is done, having stable |
| 395 | // iterators will simplify the logic. Similarly, having an efficient insert that |
| 396 | // keeps the FusionCandidateSet sorted will also simplify the implementation. |
| 397 | using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; |
| 398 | using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 399 | |
Fangrui Song | eb70ac0 | 2019-08-24 02:50:42 +0000 | [diff] [blame] | 400 | #if !defined(NDEBUG) |
| 401 | static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
Fangrui Song | 6964027 | 2019-08-28 03:12:40 +0000 | [diff] [blame] | 402 | const FusionCandidate &FC) { |
| 403 | if (FC.isValid()) |
| 404 | OS << FC.Preheader->getName(); |
| 405 | else |
| 406 | OS << "<Invalid>"; |
| 407 | |
| 408 | return OS; |
| 409 | } |
| 410 | |
| 411 | static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 412 | const FusionCandidateSet &CandSet) { |
Fangrui Song | eb70ac0 | 2019-08-24 02:50:42 +0000 | [diff] [blame] | 413 | for (const FusionCandidate &FC : CandSet) |
| 414 | OS << FC << '\n'; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 415 | |
| 416 | return OS; |
| 417 | } |
| 418 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 419 | static void |
| 420 | printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { |
Kit Barton | 8e64f0a | 2019-04-25 02:10:02 +0000 | [diff] [blame] | 421 | dbgs() << "Fusion Candidates: \n"; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 422 | for (const auto &CandidateSet : FusionCandidates) { |
Kit Barton | 8e64f0a | 2019-04-25 02:10:02 +0000 | [diff] [blame] | 423 | dbgs() << "*** Fusion Candidate Set ***\n"; |
| 424 | dbgs() << CandidateSet; |
| 425 | dbgs() << "****************************\n"; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 426 | } |
| 427 | } |
| 428 | #endif |
| 429 | |
| 430 | /// Collect all loops in function at the same nest level, starting at the |
| 431 | /// outermost level. |
| 432 | /// |
| 433 | /// This data structure collects all loops at the same nest level for a |
| 434 | /// given function (specified by the LoopInfo object). It starts at the |
| 435 | /// outermost level. |
| 436 | struct LoopDepthTree { |
| 437 | using LoopsOnLevelTy = SmallVector<LoopVector, 4>; |
| 438 | using iterator = LoopsOnLevelTy::iterator; |
| 439 | using const_iterator = LoopsOnLevelTy::const_iterator; |
| 440 | |
| 441 | LoopDepthTree(LoopInfo &LI) : Depth(1) { |
| 442 | if (!LI.empty()) |
| 443 | LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); |
| 444 | } |
| 445 | |
| 446 | /// Test whether a given loop has been removed from the function, and thus is |
| 447 | /// no longer valid. |
| 448 | bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } |
| 449 | |
| 450 | /// Record that a given loop has been removed from the function and is no |
| 451 | /// longer valid. |
| 452 | void removeLoop(const Loop *L) { RemovedLoops.insert(L); } |
| 453 | |
| 454 | /// Descend the tree to the next (inner) nesting level |
| 455 | void descend() { |
| 456 | LoopsOnLevelTy LoopsOnNextLevel; |
| 457 | |
| 458 | for (const LoopVector &LV : *this) |
| 459 | for (Loop *L : LV) |
| 460 | if (!isRemovedLoop(L) && L->begin() != L->end()) |
| 461 | LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); |
| 462 | |
| 463 | LoopsOnLevel = LoopsOnNextLevel; |
| 464 | RemovedLoops.clear(); |
| 465 | Depth++; |
| 466 | } |
| 467 | |
| 468 | bool empty() const { return size() == 0; } |
| 469 | size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } |
| 470 | unsigned getDepth() const { return Depth; } |
| 471 | |
| 472 | iterator begin() { return LoopsOnLevel.begin(); } |
| 473 | iterator end() { return LoopsOnLevel.end(); } |
| 474 | const_iterator begin() const { return LoopsOnLevel.begin(); } |
| 475 | const_iterator end() const { return LoopsOnLevel.end(); } |
| 476 | |
| 477 | private: |
| 478 | /// Set of loops that have been removed from the function and are no longer |
| 479 | /// valid. |
| 480 | SmallPtrSet<const Loop *, 8> RemovedLoops; |
| 481 | |
| 482 | /// Depth of the current level, starting at 1 (outermost loops). |
| 483 | unsigned Depth; |
| 484 | |
| 485 | /// Vector of loops at the current depth level that have the same parent loop |
| 486 | LoopsOnLevelTy LoopsOnLevel; |
| 487 | }; |
| 488 | |
| 489 | #ifndef NDEBUG |
| 490 | static void printLoopVector(const LoopVector &LV) { |
| 491 | dbgs() << "****************************\n"; |
| 492 | for (auto L : LV) |
| 493 | printLoop(*L, dbgs()); |
| 494 | dbgs() << "****************************\n"; |
| 495 | } |
| 496 | #endif |
| 497 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 498 | struct LoopFuser { |
| 499 | private: |
| 500 | // Sets of control flow equivalent fusion candidates for a given nest level. |
| 501 | FusionCandidateCollection FusionCandidates; |
| 502 | |
| 503 | LoopDepthTree LDT; |
| 504 | DomTreeUpdater DTU; |
| 505 | |
| 506 | LoopInfo &LI; |
| 507 | DominatorTree &DT; |
| 508 | DependenceInfo &DI; |
| 509 | ScalarEvolution &SE; |
| 510 | PostDominatorTree &PDT; |
| 511 | OptimizationRemarkEmitter &ORE; |
| 512 | |
| 513 | public: |
| 514 | LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, |
| 515 | ScalarEvolution &SE, PostDominatorTree &PDT, |
| 516 | OptimizationRemarkEmitter &ORE, const DataLayout &DL) |
| 517 | : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), |
| 518 | DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {} |
| 519 | |
| 520 | /// This is the main entry point for loop fusion. It will traverse the |
| 521 | /// specified function and collect candidate loops to fuse, starting at the |
| 522 | /// outermost nesting level and working inwards. |
| 523 | bool fuseLoops(Function &F) { |
| 524 | #ifndef NDEBUG |
| 525 | if (VerboseFusionDebugging) { |
| 526 | LI.print(dbgs()); |
| 527 | } |
| 528 | #endif |
| 529 | |
| 530 | LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() |
| 531 | << "\n"); |
| 532 | bool Changed = false; |
| 533 | |
| 534 | while (!LDT.empty()) { |
| 535 | LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " |
| 536 | << LDT.getDepth() << "\n";); |
| 537 | |
| 538 | for (const LoopVector &LV : LDT) { |
| 539 | assert(LV.size() > 0 && "Empty loop set was build!"); |
| 540 | |
| 541 | // Skip singleton loop sets as they do not offer fusion opportunities on |
| 542 | // this level. |
| 543 | if (LV.size() == 1) |
| 544 | continue; |
| 545 | #ifndef NDEBUG |
| 546 | if (VerboseFusionDebugging) { |
| 547 | LLVM_DEBUG({ |
| 548 | dbgs() << " Visit loop set (#" << LV.size() << "):\n"; |
| 549 | printLoopVector(LV); |
| 550 | }); |
| 551 | } |
| 552 | #endif |
| 553 | |
| 554 | collectFusionCandidates(LV); |
| 555 | Changed |= fuseCandidates(); |
| 556 | } |
| 557 | |
| 558 | // Finished analyzing candidates at this level. |
| 559 | // Descend to the next level and clear all of the candidates currently |
| 560 | // collected. Note that it will not be possible to fuse any of the |
| 561 | // existing candidates with new candidates because the new candidates will |
| 562 | // be at a different nest level and thus not be control flow equivalent |
| 563 | // with all of the candidates collected so far. |
| 564 | LLVM_DEBUG(dbgs() << "Descend one level!\n"); |
| 565 | LDT.descend(); |
| 566 | FusionCandidates.clear(); |
| 567 | } |
| 568 | |
| 569 | if (Changed) |
| 570 | LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); |
| 571 | |
| 572 | #ifndef NDEBUG |
| 573 | assert(DT.verify()); |
| 574 | assert(PDT.verify()); |
| 575 | LI.verify(DT); |
| 576 | SE.verify(); |
| 577 | #endif |
| 578 | |
| 579 | LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); |
| 580 | return Changed; |
| 581 | } |
| 582 | |
| 583 | private: |
| 584 | /// Determine if two fusion candidates are control flow equivalent. |
| 585 | /// |
| 586 | /// Two fusion candidates are control flow equivalent if when one executes, |
| 587 | /// the other is guaranteed to execute. This is determined using dominators |
| 588 | /// and post-dominators: if A dominates B and B post-dominates A then A and B |
| 589 | /// are control-flow equivalent. |
| 590 | bool isControlFlowEquivalent(const FusionCandidate &FC0, |
| 591 | const FusionCandidate &FC1) const { |
| 592 | assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); |
| 593 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 594 | BasicBlock *FC0EntryBlock = FC0.getEntryBlock(); |
| 595 | BasicBlock *FC1EntryBlock = FC1.getEntryBlock(); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 596 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 597 | if (DT.dominates(FC0EntryBlock, FC1EntryBlock)) |
| 598 | return PDT.dominates(FC1EntryBlock, FC0EntryBlock); |
| 599 | |
| 600 | if (DT.dominates(FC1EntryBlock, FC0EntryBlock)) |
| 601 | return PDT.dominates(FC0EntryBlock, FC1EntryBlock); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 602 | |
| 603 | return false; |
| 604 | } |
| 605 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 606 | /// Iterate over all loops in the given loop set and identify the loops that |
| 607 | /// are eligible for fusion. Place all eligible fusion candidates into Control |
| 608 | /// Flow Equivalent sets, sorted by dominance. |
| 609 | void collectFusionCandidates(const LoopVector &LV) { |
| 610 | for (Loop *L : LV) { |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 611 | FusionCandidate CurrCand(L, &DT, &PDT, ORE); |
| 612 | if (!CurrCand.isEligibleForFusion(SE)) |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 613 | continue; |
| 614 | |
| 615 | // Go through each list in FusionCandidates and determine if L is control |
| 616 | // flow equivalent with the first loop in that list. If it is, append LV. |
| 617 | // If not, go to the next list. |
| 618 | // If no suitable list is found, start another list and add it to |
| 619 | // FusionCandidates. |
| 620 | bool FoundSet = false; |
| 621 | |
| 622 | for (auto &CurrCandSet : FusionCandidates) { |
| 623 | if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { |
| 624 | CurrCandSet.insert(CurrCand); |
| 625 | FoundSet = true; |
| 626 | #ifndef NDEBUG |
| 627 | if (VerboseFusionDebugging) |
| 628 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand |
| 629 | << " to existing candidate set\n"); |
| 630 | #endif |
| 631 | break; |
| 632 | } |
| 633 | } |
| 634 | if (!FoundSet) { |
| 635 | // No set was found. Create a new set and add to FusionCandidates |
| 636 | #ifndef NDEBUG |
| 637 | if (VerboseFusionDebugging) |
| 638 | LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); |
| 639 | #endif |
| 640 | FusionCandidateSet NewCandSet; |
| 641 | NewCandSet.insert(CurrCand); |
| 642 | FusionCandidates.push_back(NewCandSet); |
| 643 | } |
| 644 | NumFusionCandidates++; |
| 645 | } |
| 646 | } |
| 647 | |
| 648 | /// Determine if it is beneficial to fuse two loops. |
| 649 | /// |
| 650 | /// For now, this method simply returns true because we want to fuse as much |
| 651 | /// as possible (primarily to test the pass). This method will evolve, over |
| 652 | /// time, to add heuristics for profitability of fusion. |
| 653 | bool isBeneficialFusion(const FusionCandidate &FC0, |
| 654 | const FusionCandidate &FC1) { |
| 655 | return true; |
| 656 | } |
| 657 | |
| 658 | /// Determine if two fusion candidates have the same trip count (i.e., they |
| 659 | /// execute the same number of iterations). |
| 660 | /// |
| 661 | /// Note that for now this method simply returns a boolean value because there |
| 662 | /// are no mechanisms in loop fusion to handle different trip counts. In the |
| 663 | /// future, this behaviour can be extended to adjust one of the loops to make |
| 664 | /// the trip counts equal (e.g., loop peeling). When this is added, this |
| 665 | /// interface may need to change to return more information than just a |
| 666 | /// boolean value. |
| 667 | bool identicalTripCounts(const FusionCandidate &FC0, |
| 668 | const FusionCandidate &FC1) const { |
| 669 | const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); |
| 670 | if (isa<SCEVCouldNotCompute>(TripCount0)) { |
| 671 | UncomputableTripCount++; |
| 672 | LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); |
| 673 | return false; |
| 674 | } |
| 675 | |
| 676 | const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); |
| 677 | if (isa<SCEVCouldNotCompute>(TripCount1)) { |
| 678 | UncomputableTripCount++; |
| 679 | LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); |
| 680 | return false; |
| 681 | } |
| 682 | LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " |
| 683 | << *TripCount1 << " are " |
| 684 | << (TripCount0 == TripCount1 ? "identical" : "different") |
| 685 | << "\n"); |
| 686 | |
| 687 | return (TripCount0 == TripCount1); |
| 688 | } |
| 689 | |
| 690 | /// Walk each set of control flow equivalent fusion candidates and attempt to |
| 691 | /// fuse them. This does a single linear traversal of all candidates in the |
| 692 | /// set. The conditions for legal fusion are checked at this point. If a pair |
| 693 | /// of fusion candidates passes all legality checks, they are fused together |
| 694 | /// and a new fusion candidate is created and added to the FusionCandidateSet. |
| 695 | /// The original fusion candidates are then removed, as they are no longer |
| 696 | /// valid. |
| 697 | bool fuseCandidates() { |
| 698 | bool Fused = false; |
| 699 | LLVM_DEBUG(printFusionCandidates(FusionCandidates)); |
| 700 | for (auto &CandidateSet : FusionCandidates) { |
| 701 | if (CandidateSet.size() < 2) |
| 702 | continue; |
| 703 | |
| 704 | LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" |
| 705 | << CandidateSet << "\n"); |
| 706 | |
| 707 | for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { |
| 708 | assert(!LDT.isRemovedLoop(FC0->L) && |
| 709 | "Should not have removed loops in CandidateSet!"); |
| 710 | auto FC1 = FC0; |
| 711 | for (++FC1; FC1 != CandidateSet.end(); ++FC1) { |
| 712 | assert(!LDT.isRemovedLoop(FC1->L) && |
| 713 | "Should not have removed loops in CandidateSet!"); |
| 714 | |
| 715 | LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); |
| 716 | dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); |
| 717 | |
| 718 | FC0->verify(); |
| 719 | FC1->verify(); |
| 720 | |
| 721 | if (!identicalTripCounts(*FC0, *FC1)) { |
| 722 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " |
| 723 | "counts. Not fusing.\n"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 724 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 725 | NonEqualTripCount); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 726 | continue; |
| 727 | } |
| 728 | |
| 729 | if (!isAdjacent(*FC0, *FC1)) { |
| 730 | LLVM_DEBUG(dbgs() |
| 731 | << "Fusion candidates are not adjacent. Not fusing.\n"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 732 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 733 | continue; |
| 734 | } |
| 735 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 736 | // Ensure that FC0 and FC1 have identical guards. |
| 737 | // If one (or both) are not guarded, this check is not necessary. |
| 738 | if (FC0->GuardBranch && FC1->GuardBranch && |
| 739 | !haveIdenticalGuards(*FC0, *FC1)) { |
| 740 | LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " |
| 741 | "guards. Not Fusing.\n"); |
| 742 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 743 | NonIdenticalGuards); |
| 744 | continue; |
| 745 | } |
| 746 | |
| 747 | // The following three checks look for empty blocks in FC0 and FC1. If |
| 748 | // any of these blocks are non-empty, we do not fuse. This is done |
| 749 | // because we currently do not have the safety checks to determine if |
| 750 | // it is safe to move the blocks past other blocks in the loop. Once |
| 751 | // these checks are added, these conditions can be relaxed. |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 752 | if (!isEmptyPreheader(*FC1)) { |
| 753 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " |
| 754 | "preheader. Not fusing.\n"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 755 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 756 | NonEmptyPreheader); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 757 | continue; |
| 758 | } |
| 759 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 760 | if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { |
| 761 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " |
| 762 | "block. Not fusing.\n"); |
| 763 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 764 | NonEmptyExitBlock); |
| 765 | continue; |
| 766 | } |
| 767 | |
| 768 | if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { |
| 769 | LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " |
| 770 | "block. Not fusing.\n"); |
| 771 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 772 | NonEmptyGuardBlock); |
| 773 | continue; |
| 774 | } |
| 775 | |
| 776 | // Check the dependencies across the loops and do not fuse if it would |
| 777 | // violate them. |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 778 | if (!dependencesAllowFusion(*FC0, *FC1)) { |
| 779 | LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 780 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 781 | InvalidDependencies); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 782 | continue; |
| 783 | } |
| 784 | |
| 785 | bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); |
| 786 | LLVM_DEBUG(dbgs() |
| 787 | << "\tFusion appears to be " |
| 788 | << (BeneficialToFuse ? "" : "un") << "profitable!\n"); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 789 | if (!BeneficialToFuse) { |
| 790 | reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, |
| 791 | FusionNotBeneficial); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 792 | continue; |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 793 | } |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 794 | // All analysis has completed and has determined that fusion is legal |
| 795 | // and profitable. At this point, start transforming the code and |
| 796 | // perform fusion. |
| 797 | |
| 798 | LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " |
| 799 | << *FC1 << "\n"); |
| 800 | |
| 801 | // Report fusion to the Optimization Remarks. |
| 802 | // Note this needs to be done *before* performFusion because |
| 803 | // performFusion will change the original loops, making it not |
| 804 | // possible to identify them after fusion is complete. |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 805 | reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 806 | |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 807 | FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 808 | FusedCand.verify(); |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 809 | assert(FusedCand.isEligibleForFusion(SE) && |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 810 | "Fused candidate should be eligible for fusion!"); |
| 811 | |
| 812 | // Notify the loop-depth-tree that these loops are not valid objects |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 813 | LDT.removeLoop(FC1->L); |
| 814 | |
| 815 | CandidateSet.erase(FC0); |
| 816 | CandidateSet.erase(FC1); |
| 817 | |
| 818 | auto InsertPos = CandidateSet.insert(FusedCand); |
| 819 | |
| 820 | assert(InsertPos.second && |
| 821 | "Unable to insert TargetCandidate in CandidateSet!"); |
| 822 | |
| 823 | // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations |
| 824 | // of the FC1 loop will attempt to fuse the new (fused) loop with the |
| 825 | // remaining candidates in the current candidate set. |
| 826 | FC0 = FC1 = InsertPos.first; |
| 827 | |
| 828 | LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet |
| 829 | << "\n"); |
| 830 | |
| 831 | Fused = true; |
| 832 | } |
| 833 | } |
| 834 | } |
| 835 | return Fused; |
| 836 | } |
| 837 | |
| 838 | /// Rewrite all additive recurrences in a SCEV to use a new loop. |
| 839 | class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { |
| 840 | public: |
| 841 | AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, |
| 842 | bool UseMax = true) |
| 843 | : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), |
| 844 | NewL(NewL) {} |
| 845 | |
| 846 | const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { |
| 847 | const Loop *ExprL = Expr->getLoop(); |
| 848 | SmallVector<const SCEV *, 2> Operands; |
| 849 | if (ExprL == &OldL) { |
| 850 | Operands.append(Expr->op_begin(), Expr->op_end()); |
| 851 | return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); |
| 852 | } |
| 853 | |
| 854 | if (OldL.contains(ExprL)) { |
| 855 | bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); |
| 856 | if (!UseMax || !Pos || !Expr->isAffine()) { |
| 857 | Valid = false; |
| 858 | return Expr; |
| 859 | } |
| 860 | return visit(Expr->getStart()); |
| 861 | } |
| 862 | |
| 863 | for (const SCEV *Op : Expr->operands()) |
| 864 | Operands.push_back(visit(Op)); |
| 865 | return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); |
| 866 | } |
| 867 | |
| 868 | bool wasValidSCEV() const { return Valid; } |
| 869 | |
| 870 | private: |
| 871 | bool Valid, UseMax; |
| 872 | const Loop &OldL, &NewL; |
| 873 | }; |
| 874 | |
| 875 | /// Return false if the access functions of \p I0 and \p I1 could cause |
| 876 | /// a negative dependence. |
| 877 | bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, |
| 878 | Instruction &I1, bool EqualIsInvalid) { |
| 879 | Value *Ptr0 = getLoadStorePointerOperand(&I0); |
| 880 | Value *Ptr1 = getLoadStorePointerOperand(&I1); |
| 881 | if (!Ptr0 || !Ptr1) |
| 882 | return false; |
| 883 | |
| 884 | const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); |
| 885 | const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); |
| 886 | #ifndef NDEBUG |
| 887 | if (VerboseFusionDebugging) |
| 888 | LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " |
| 889 | << *SCEVPtr1 << "\n"); |
| 890 | #endif |
| 891 | AddRecLoopReplacer Rewriter(SE, L0, L1); |
| 892 | SCEVPtr0 = Rewriter.visit(SCEVPtr0); |
| 893 | #ifndef NDEBUG |
| 894 | if (VerboseFusionDebugging) |
| 895 | LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 |
| 896 | << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); |
| 897 | #endif |
| 898 | if (!Rewriter.wasValidSCEV()) |
| 899 | return false; |
| 900 | |
| 901 | // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by |
| 902 | // L0) and the other is not. We could check if it is monotone and test |
| 903 | // the beginning and end value instead. |
| 904 | |
| 905 | BasicBlock *L0Header = L0.getHeader(); |
| 906 | auto HasNonLinearDominanceRelation = [&](const SCEV *S) { |
| 907 | const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); |
| 908 | if (!AddRec) |
| 909 | return false; |
| 910 | return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && |
| 911 | !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); |
| 912 | }; |
| 913 | if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) |
| 914 | return false; |
| 915 | |
| 916 | ICmpInst::Predicate Pred = |
| 917 | EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; |
| 918 | bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); |
| 919 | #ifndef NDEBUG |
| 920 | if (VerboseFusionDebugging) |
| 921 | LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 |
| 922 | << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 |
| 923 | << "\n"); |
| 924 | #endif |
| 925 | return IsAlwaysGE; |
| 926 | } |
| 927 | |
| 928 | /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in |
| 929 | /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses |
| 930 | /// specified by @p DepChoice are used to determine this. |
| 931 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
| 932 | const FusionCandidate &FC1, Instruction &I0, |
| 933 | Instruction &I1, bool AnyDep, |
| 934 | FusionDependenceAnalysisChoice DepChoice) { |
| 935 | #ifndef NDEBUG |
| 936 | if (VerboseFusionDebugging) { |
| 937 | LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " |
| 938 | << DepChoice << "\n"); |
| 939 | } |
| 940 | #endif |
| 941 | switch (DepChoice) { |
| 942 | case FUSION_DEPENDENCE_ANALYSIS_SCEV: |
| 943 | return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); |
| 944 | case FUSION_DEPENDENCE_ANALYSIS_DA: { |
| 945 | auto DepResult = DI.depends(&I0, &I1, true); |
| 946 | if (!DepResult) |
| 947 | return true; |
| 948 | #ifndef NDEBUG |
| 949 | if (VerboseFusionDebugging) { |
| 950 | LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); |
| 951 | dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " |
| 952 | << (DepResult->isOrdered() ? "true" : "false") |
| 953 | << "]\n"); |
| 954 | LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() |
| 955 | << "\n"); |
| 956 | } |
| 957 | #endif |
| 958 | |
| 959 | if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) |
| 960 | LLVM_DEBUG( |
| 961 | dbgs() << "TODO: Implement pred/succ dependence handling!\n"); |
| 962 | |
| 963 | // TODO: Can we actually use the dependence info analysis here? |
| 964 | return false; |
| 965 | } |
| 966 | |
| 967 | case FUSION_DEPENDENCE_ANALYSIS_ALL: |
| 968 | return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
| 969 | FUSION_DEPENDENCE_ANALYSIS_SCEV) || |
| 970 | dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, |
| 971 | FUSION_DEPENDENCE_ANALYSIS_DA); |
| 972 | } |
| 973 | |
| 974 | llvm_unreachable("Unknown fusion dependence analysis choice!"); |
| 975 | } |
| 976 | |
| 977 | /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. |
| 978 | bool dependencesAllowFusion(const FusionCandidate &FC0, |
| 979 | const FusionCandidate &FC1) { |
| 980 | LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 |
| 981 | << "\n"); |
| 982 | assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 983 | assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 984 | |
| 985 | for (Instruction *WriteL0 : FC0.MemWrites) { |
| 986 | for (Instruction *WriteL1 : FC1.MemWrites) |
| 987 | if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, |
| 988 | /* AnyDep */ false, |
| 989 | FusionDependenceAnalysis)) { |
| 990 | InvalidDependencies++; |
| 991 | return false; |
| 992 | } |
| 993 | for (Instruction *ReadL1 : FC1.MemReads) |
| 994 | if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, |
| 995 | /* AnyDep */ false, |
| 996 | FusionDependenceAnalysis)) { |
| 997 | InvalidDependencies++; |
| 998 | return false; |
| 999 | } |
| 1000 | } |
| 1001 | |
| 1002 | for (Instruction *WriteL1 : FC1.MemWrites) { |
| 1003 | for (Instruction *WriteL0 : FC0.MemWrites) |
| 1004 | if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, |
| 1005 | /* AnyDep */ false, |
| 1006 | FusionDependenceAnalysis)) { |
| 1007 | InvalidDependencies++; |
| 1008 | return false; |
| 1009 | } |
| 1010 | for (Instruction *ReadL0 : FC0.MemReads) |
| 1011 | if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, |
| 1012 | /* AnyDep */ false, |
| 1013 | FusionDependenceAnalysis)) { |
| 1014 | InvalidDependencies++; |
| 1015 | return false; |
| 1016 | } |
| 1017 | } |
| 1018 | |
| 1019 | // Walk through all uses in FC1. For each use, find the reaching def. If the |
| 1020 | // def is located in FC0 then it is is not safe to fuse. |
| 1021 | for (BasicBlock *BB : FC1.L->blocks()) |
| 1022 | for (Instruction &I : *BB) |
| 1023 | for (auto &Op : I.operands()) |
| 1024 | if (Instruction *Def = dyn_cast<Instruction>(Op)) |
| 1025 | if (FC0.L->contains(Def->getParent())) { |
| 1026 | InvalidDependencies++; |
| 1027 | return false; |
| 1028 | } |
| 1029 | |
| 1030 | return true; |
| 1031 | } |
| 1032 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 1033 | /// Determine if two fusion candidates are adjacent in the CFG. |
| 1034 | /// |
| 1035 | /// This method will determine if there are additional basic blocks in the CFG |
| 1036 | /// between the exit of \p FC0 and the entry of \p FC1. |
| 1037 | /// If the two candidates are guarded loops, then it checks whether the |
| 1038 | /// non-loop successor of the \p FC0 guard branch is the entry block of \p |
| 1039 | /// FC1. If not, then the loops are not adjacent. If the two candidates are |
| 1040 | /// not guarded loops, then it checks whether the exit block of \p FC0 is the |
| 1041 | /// preheader of \p FC1. |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1042 | bool isAdjacent(const FusionCandidate &FC0, |
| 1043 | const FusionCandidate &FC1) const { |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 1044 | // If the successor of the guard branch is FC1, then the loops are adjacent |
| 1045 | if (FC0.GuardBranch) |
| 1046 | return FC0.getNonLoopBlock() == FC1.getEntryBlock(); |
| 1047 | else |
| 1048 | return FC0.ExitBlock == FC1.getEntryBlock(); |
| 1049 | } |
| 1050 | |
| 1051 | /// Determine if two fusion candidates have identical guards |
| 1052 | /// |
| 1053 | /// This method will determine if two fusion candidates have the same guards. |
| 1054 | /// The guards are considered the same if: |
| 1055 | /// 1. The instructions to compute the condition used in the compare are |
| 1056 | /// identical. |
| 1057 | /// 2. The successors of the guard have the same flow into/around the loop. |
| 1058 | /// If the compare instructions are identical, then the first successor of the |
| 1059 | /// guard must go to the same place (either the preheader of the loop or the |
| 1060 | /// NonLoopBlock). In other words, the the first successor of both loops must |
| 1061 | /// both go into the loop (i.e., the preheader) or go around the loop (i.e., |
| 1062 | /// the NonLoopBlock). The same must be true for the second successor. |
| 1063 | bool haveIdenticalGuards(const FusionCandidate &FC0, |
| 1064 | const FusionCandidate &FC1) const { |
| 1065 | assert(FC0.GuardBranch && FC1.GuardBranch && |
| 1066 | "Expecting FC0 and FC1 to be guarded loops."); |
| 1067 | |
| 1068 | if (auto FC0CmpInst = |
| 1069 | dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) |
| 1070 | if (auto FC1CmpInst = |
| 1071 | dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) |
| 1072 | if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) |
| 1073 | return false; |
| 1074 | |
| 1075 | // The compare instructions are identical. |
| 1076 | // Now make sure the successor of the guards have the same flow into/around |
| 1077 | // the loop |
| 1078 | if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) |
| 1079 | return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); |
| 1080 | else |
| 1081 | return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); |
| 1082 | } |
| 1083 | |
| 1084 | /// Check that the guard for \p FC *only* contains the cmp/branch for the |
| 1085 | /// guard. |
| 1086 | /// Once we are able to handle intervening code, any code in the guard block |
| 1087 | /// for FC1 will need to be treated as intervening code and checked whether |
| 1088 | /// it can safely move around the loops. |
| 1089 | bool isEmptyGuardBlock(const FusionCandidate &FC) const { |
| 1090 | assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); |
| 1091 | if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) { |
| 1092 | auto *GuardBlock = FC.GuardBranch->getParent(); |
| 1093 | // If the generation of the cmp value is in GuardBlock, then the size of |
| 1094 | // the guard block should be 2 (cmp + branch). If the generation of the |
| 1095 | // cmp value is in a different block, then the size of the guard block |
| 1096 | // should only be 1. |
| 1097 | if (CmpInst->getParent() == GuardBlock) |
| 1098 | return GuardBlock->size() == 2; |
| 1099 | else |
| 1100 | return GuardBlock->size() == 1; |
| 1101 | } |
| 1102 | |
| 1103 | return false; |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1104 | } |
| 1105 | |
| 1106 | bool isEmptyPreheader(const FusionCandidate &FC) const { |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 1107 | assert(FC.Preheader && "Expecting a valid preheader"); |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1108 | return FC.Preheader->size() == 1; |
| 1109 | } |
| 1110 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 1111 | bool isEmptyExitBlock(const FusionCandidate &FC) const { |
| 1112 | assert(FC.ExitBlock && "Expecting a valid exit block"); |
| 1113 | return FC.ExitBlock->size() == 1; |
| 1114 | } |
| 1115 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1116 | /// Fuse two fusion candidates, creating a new fused loop. |
| 1117 | /// |
| 1118 | /// This method contains the mechanics of fusing two loops, represented by \p |
| 1119 | /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 |
| 1120 | /// postdominates \p FC0 (making them control flow equivalent). It also |
| 1121 | /// assumes that the other conditions for fusion have been met: adjacent, |
| 1122 | /// identical trip counts, and no negative distance dependencies exist that |
| 1123 | /// would prevent fusion. Thus, there is no checking for these conditions in |
| 1124 | /// this method. |
| 1125 | /// |
| 1126 | /// Fusion is performed by rewiring the CFG to update successor blocks of the |
| 1127 | /// components of tho loop. Specifically, the following changes are done: |
| 1128 | /// |
| 1129 | /// 1. The preheader of \p FC1 is removed as it is no longer necessary |
| 1130 | /// (because it is currently only a single statement block). |
| 1131 | /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. |
| 1132 | /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. |
| 1133 | /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. |
| 1134 | /// |
| 1135 | /// All of these modifications are done with dominator tree updates, thus |
| 1136 | /// keeping the dominator (and post dominator) information up-to-date. |
| 1137 | /// |
| 1138 | /// This can be improved in the future by actually merging blocks during |
| 1139 | /// fusion. For example, the preheader of \p FC1 can be merged with the |
| 1140 | /// preheader of \p FC0. This would allow loops with more than a single |
| 1141 | /// statement in the preheader to be fused. Similarly, the latch blocks of the |
| 1142 | /// two loops could also be fused into a single block. This will require |
| 1143 | /// analysis to prove it is safe to move the contents of the block past |
| 1144 | /// existing code, which currently has not been implemented. |
| 1145 | Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { |
| 1146 | assert(FC0.isValid() && FC1.isValid() && |
| 1147 | "Expecting valid fusion candidates"); |
| 1148 | |
| 1149 | LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); |
| 1150 | dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); |
| 1151 | |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 1152 | // Fusing guarded loops is handled slightly differently than non-guarded |
| 1153 | // loops and has been broken out into a separate method instead of trying to |
| 1154 | // intersperse the logic within a single method. |
| 1155 | if (FC0.GuardBranch) |
| 1156 | return fuseGuardedLoops(FC0, FC1); |
| 1157 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1158 | assert(FC1.Preheader == FC0.ExitBlock); |
| 1159 | assert(FC1.Preheader->size() == 1 && |
| 1160 | FC1.Preheader->getSingleSuccessor() == FC1.Header); |
| 1161 | |
| 1162 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
| 1163 | // them later. However, this is only necessary if the new loop carried |
| 1164 | // values might not dominate the exiting branch. While we do not generally |
| 1165 | // test if this is the case but simply insert intermediate phi nodes, we |
| 1166 | // need to make sure these intermediate phi nodes have different |
| 1167 | // predecessors. To this end, we filter the special case where the exiting |
| 1168 | // block is the latch block of the first loop. Nothing needs to be done |
| 1169 | // anyway as all loop carried values dominate the latch and thereby also the |
| 1170 | // exiting branch. |
| 1171 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
| 1172 | if (FC0.ExitingBlock != FC0.Latch) |
| 1173 | for (PHINode &PHI : FC0.Header->phis()) |
| 1174 | OriginalFC0PHIs.push_back(&PHI); |
| 1175 | |
| 1176 | // Replace incoming blocks for header PHIs first. |
| 1177 | FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); |
| 1178 | FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); |
| 1179 | |
| 1180 | // Then modify the control flow and update DT and PDT. |
| 1181 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 1182 | |
| 1183 | // The old exiting block of the first loop (FC0) has to jump to the header |
| 1184 | // of the second as we need to execute the code in the second header block |
| 1185 | // regardless of the trip count. That is, if the trip count is 0, so the |
| 1186 | // back edge is never taken, we still have to execute both loop headers, |
| 1187 | // especially (but not only!) if the second is a do-while style loop. |
| 1188 | // However, doing so might invalidate the phi nodes of the first loop as |
| 1189 | // the new values do only need to dominate their latch and not the exiting |
| 1190 | // predicate. To remedy this potential problem we always introduce phi |
| 1191 | // nodes in the header of the second loop later that select the loop carried |
| 1192 | // value, if the second header was reached through an old latch of the |
| 1193 | // first, or undef otherwise. This is sound as exiting the first implies the |
| 1194 | // second will exit too, __without__ taking the back-edge. [Their |
| 1195 | // trip-counts are equal after all. |
| 1196 | // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go |
| 1197 | // to FC1.Header? I think this is basically what the three sequences are |
| 1198 | // trying to accomplish; however, doing this directly in the CFG may mean |
| 1199 | // the DT/PDT becomes invalid |
| 1200 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, |
| 1201 | FC1.Header); |
| 1202 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1203 | DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); |
| 1204 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1205 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1206 | |
| 1207 | // The pre-header of L1 is not necessary anymore. |
| 1208 | assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); |
| 1209 | FC1.Preheader->getTerminator()->eraseFromParent(); |
| 1210 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
| 1211 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1212 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
| 1213 | |
| 1214 | // Moves the phi nodes from the second to the first loops header block. |
| 1215 | while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { |
| 1216 | if (SE.isSCEVable(PHI->getType())) |
| 1217 | SE.forgetValue(PHI); |
| 1218 | if (PHI->hasNUsesOrMore(1)) |
| 1219 | PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); |
| 1220 | else |
| 1221 | PHI->eraseFromParent(); |
| 1222 | } |
| 1223 | |
| 1224 | // Introduce new phi nodes in the second loop header to ensure |
| 1225 | // exiting the first and jumping to the header of the second does not break |
| 1226 | // the SSA property of the phis originally in the first loop. See also the |
| 1227 | // comment above. |
| 1228 | Instruction *L1HeaderIP = &FC1.Header->front(); |
| 1229 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
| 1230 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); |
| 1231 | assert(L1LatchBBIdx >= 0 && |
| 1232 | "Expected loop carried value to be rewired at this point!"); |
| 1233 | |
| 1234 | Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); |
| 1235 | |
| 1236 | PHINode *L1HeaderPHI = PHINode::Create( |
| 1237 | LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); |
| 1238 | L1HeaderPHI->addIncoming(LCV, FC0.Latch); |
| 1239 | L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), |
| 1240 | FC0.ExitingBlock); |
| 1241 | |
| 1242 | LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); |
| 1243 | } |
| 1244 | |
| 1245 | // Replace latch terminator destinations. |
| 1246 | FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); |
| 1247 | FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); |
| 1248 | |
| 1249 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
| 1250 | // performed the updates above. |
| 1251 | if (FC0.Latch != FC0.ExitingBlock) |
| 1252 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1253 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
| 1254 | |
| 1255 | TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, |
| 1256 | FC0.Latch, FC0.Header)); |
| 1257 | TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, |
| 1258 | FC1.Latch, FC0.Header)); |
| 1259 | TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, |
| 1260 | FC1.Latch, FC1.Header)); |
| 1261 | |
| 1262 | // Update DT/PDT |
| 1263 | DTU.applyUpdates(TreeUpdates); |
| 1264 | |
| 1265 | LI.removeBlock(FC1.Preheader); |
| 1266 | DTU.deleteBB(FC1.Preheader); |
| 1267 | DTU.flush(); |
| 1268 | |
| 1269 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
| 1270 | // and rebuild the information in subsequent passes of fusion? |
| 1271 | SE.forgetLoop(FC1.L); |
| 1272 | SE.forgetLoop(FC0.L); |
| 1273 | |
| 1274 | // Merge the loops. |
| 1275 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), |
| 1276 | FC1.L->block_end()); |
| 1277 | for (BasicBlock *BB : Blocks) { |
| 1278 | FC0.L->addBlockEntry(BB); |
| 1279 | FC1.L->removeBlockFromLoop(BB); |
| 1280 | if (LI.getLoopFor(BB) != FC1.L) |
| 1281 | continue; |
| 1282 | LI.changeLoopFor(BB, FC0.L); |
| 1283 | } |
| 1284 | while (!FC1.L->empty()) { |
| 1285 | const auto &ChildLoopIt = FC1.L->begin(); |
| 1286 | Loop *ChildLoop = *ChildLoopIt; |
| 1287 | FC1.L->removeChildLoop(ChildLoopIt); |
| 1288 | FC0.L->addChildLoop(ChildLoop); |
| 1289 | } |
| 1290 | |
| 1291 | // Delete the now empty loop L1. |
| 1292 | LI.erase(FC1.L); |
| 1293 | |
| 1294 | #ifndef NDEBUG |
| 1295 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
| 1296 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
| 1297 | assert(PDT.verify()); |
| 1298 | LI.verify(DT); |
| 1299 | SE.verify(); |
| 1300 | #endif |
| 1301 | |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1302 | LLVM_DEBUG(dbgs() << "Fusion done:\n"); |
| 1303 | |
| 1304 | return FC0.L; |
| 1305 | } |
Kit Barton | de0b633 | 2019-07-30 15:58:43 +0000 | [diff] [blame] | 1306 | |
| 1307 | /// Report details on loop fusion opportunities. |
| 1308 | /// |
| 1309 | /// This template function can be used to report both successful and missed |
| 1310 | /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should |
| 1311 | /// be one of: |
| 1312 | /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful |
| 1313 | /// given two valid fusion candidates. |
| 1314 | /// - OptimizationRemark to report successful fusion of two fusion |
| 1315 | /// candidates. |
| 1316 | /// The remarks will be printed using the form: |
| 1317 | /// <path/filename>:<line number>:<column number>: [<function name>]: |
| 1318 | /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> |
| 1319 | template <typename RemarkKind> |
| 1320 | void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, |
| 1321 | llvm::Statistic &Stat) { |
| 1322 | assert(FC0.Preheader && FC1.Preheader && |
| 1323 | "Expecting valid fusion candidates"); |
| 1324 | using namespace ore; |
| 1325 | ++Stat; |
| 1326 | ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), |
| 1327 | FC0.Preheader) |
| 1328 | << "[" << FC0.Preheader->getParent()->getName() |
| 1329 | << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) |
| 1330 | << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) |
| 1331 | << ": " << Stat.getDesc()); |
| 1332 | } |
Kit Barton | 50bc610 | 2019-09-26 21:42:45 +0000 | [diff] [blame] | 1333 | |
| 1334 | /// Fuse two guarded fusion candidates, creating a new fused loop. |
| 1335 | /// |
| 1336 | /// Fusing guarded loops is handled much the same way as fusing non-guarded |
| 1337 | /// loops. The rewiring of the CFG is slightly different though, because of |
| 1338 | /// the presence of the guards around the loops and the exit blocks after the |
| 1339 | /// loop body. As such, the new loop is rewired as follows: |
| 1340 | /// 1. Keep the guard branch from FC0 and use the non-loop block target |
| 1341 | /// from the FC1 guard branch. |
| 1342 | /// 2. Remove the exit block from FC0 (this exit block should be empty |
| 1343 | /// right now). |
| 1344 | /// 3. Remove the guard branch for FC1 |
| 1345 | /// 4. Remove the preheader for FC1. |
| 1346 | /// The exit block successor for the latch of FC0 is updated to be the header |
| 1347 | /// of FC1 and the non-exit block successor of the latch of FC1 is updated to |
| 1348 | /// be the header of FC0, thus creating the fused loop. |
| 1349 | Loop *fuseGuardedLoops(const FusionCandidate &FC0, |
| 1350 | const FusionCandidate &FC1) { |
| 1351 | assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); |
| 1352 | |
| 1353 | BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); |
| 1354 | BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); |
| 1355 | BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); |
| 1356 | BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); |
| 1357 | |
| 1358 | assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); |
| 1359 | |
| 1360 | SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; |
| 1361 | |
| 1362 | //////////////////////////////////////////////////////////////////////////// |
| 1363 | // Update the Loop Guard |
| 1364 | //////////////////////////////////////////////////////////////////////////// |
| 1365 | // The guard for FC0 is updated to guard both FC0 and FC1. This is done by |
| 1366 | // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. |
| 1367 | // Thus, one path from the guard goes to the preheader for FC0 (and thus |
| 1368 | // executes the new fused loop) and the other path goes to the NonLoopBlock |
| 1369 | // for FC1 (where FC1 guard would have gone if FC1 was not executed). |
| 1370 | FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); |
| 1371 | FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, |
| 1372 | FC1.Header); |
| 1373 | |
| 1374 | // The guard of FC1 is not necessary anymore. |
| 1375 | FC1.GuardBranch->eraseFromParent(); |
| 1376 | new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); |
| 1377 | |
| 1378 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1379 | DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); |
| 1380 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1381 | DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); |
| 1382 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1383 | DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); |
| 1384 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1385 | DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); |
| 1386 | |
| 1387 | assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && |
| 1388 | "Expecting guard block to have no predecessors"); |
| 1389 | assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && |
| 1390 | "Expecting guard block to have no successors"); |
| 1391 | |
| 1392 | // Remember the phi nodes originally in the header of FC0 in order to rewire |
| 1393 | // them later. However, this is only necessary if the new loop carried |
| 1394 | // values might not dominate the exiting branch. While we do not generally |
| 1395 | // test if this is the case but simply insert intermediate phi nodes, we |
| 1396 | // need to make sure these intermediate phi nodes have different |
| 1397 | // predecessors. To this end, we filter the special case where the exiting |
| 1398 | // block is the latch block of the first loop. Nothing needs to be done |
| 1399 | // anyway as all loop carried values dominate the latch and thereby also the |
| 1400 | // exiting branch. |
| 1401 | // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch |
| 1402 | // (because the loops are rotated. Thus, nothing will ever be added to |
| 1403 | // OriginalFC0PHIs. |
| 1404 | SmallVector<PHINode *, 8> OriginalFC0PHIs; |
| 1405 | if (FC0.ExitingBlock != FC0.Latch) |
| 1406 | for (PHINode &PHI : FC0.Header->phis()) |
| 1407 | OriginalFC0PHIs.push_back(&PHI); |
| 1408 | |
| 1409 | assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); |
| 1410 | |
| 1411 | // Replace incoming blocks for header PHIs first. |
| 1412 | FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); |
| 1413 | FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); |
| 1414 | |
| 1415 | // The old exiting block of the first loop (FC0) has to jump to the header |
| 1416 | // of the second as we need to execute the code in the second header block |
| 1417 | // regardless of the trip count. That is, if the trip count is 0, so the |
| 1418 | // back edge is never taken, we still have to execute both loop headers, |
| 1419 | // especially (but not only!) if the second is a do-while style loop. |
| 1420 | // However, doing so might invalidate the phi nodes of the first loop as |
| 1421 | // the new values do only need to dominate their latch and not the exiting |
| 1422 | // predicate. To remedy this potential problem we always introduce phi |
| 1423 | // nodes in the header of the second loop later that select the loop carried |
| 1424 | // value, if the second header was reached through an old latch of the |
| 1425 | // first, or undef otherwise. This is sound as exiting the first implies the |
| 1426 | // second will exit too, __without__ taking the back-edge (their |
| 1427 | // trip-counts are equal after all). |
| 1428 | FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, |
| 1429 | FC1.Header); |
| 1430 | |
| 1431 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1432 | DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); |
| 1433 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1434 | DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); |
| 1435 | |
| 1436 | // Remove FC0 Exit Block |
| 1437 | // The exit block for FC0 is no longer needed since control will flow |
| 1438 | // directly to the header of FC1. Since it is an empty block, it can be |
| 1439 | // removed at this point. |
| 1440 | // TODO: In the future, we can handle non-empty exit blocks my merging any |
| 1441 | // instructions from FC0 exit block into FC1 exit block prior to removing |
| 1442 | // the block. |
| 1443 | assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) && |
| 1444 | "Expecting exit block to be empty"); |
| 1445 | FC0.ExitBlock->getTerminator()->eraseFromParent(); |
| 1446 | new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); |
| 1447 | |
| 1448 | // Remove FC1 Preheader |
| 1449 | // The pre-header of L1 is not necessary anymore. |
| 1450 | assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); |
| 1451 | FC1.Preheader->getTerminator()->eraseFromParent(); |
| 1452 | new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); |
| 1453 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1454 | DominatorTree::Delete, FC1.Preheader, FC1.Header)); |
| 1455 | |
| 1456 | // Moves the phi nodes from the second to the first loops header block. |
| 1457 | while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { |
| 1458 | if (SE.isSCEVable(PHI->getType())) |
| 1459 | SE.forgetValue(PHI); |
| 1460 | if (PHI->hasNUsesOrMore(1)) |
| 1461 | PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); |
| 1462 | else |
| 1463 | PHI->eraseFromParent(); |
| 1464 | } |
| 1465 | |
| 1466 | // Introduce new phi nodes in the second loop header to ensure |
| 1467 | // exiting the first and jumping to the header of the second does not break |
| 1468 | // the SSA property of the phis originally in the first loop. See also the |
| 1469 | // comment above. |
| 1470 | Instruction *L1HeaderIP = &FC1.Header->front(); |
| 1471 | for (PHINode *LCPHI : OriginalFC0PHIs) { |
| 1472 | int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); |
| 1473 | assert(L1LatchBBIdx >= 0 && |
| 1474 | "Expected loop carried value to be rewired at this point!"); |
| 1475 | |
| 1476 | Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); |
| 1477 | |
| 1478 | PHINode *L1HeaderPHI = PHINode::Create( |
| 1479 | LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); |
| 1480 | L1HeaderPHI->addIncoming(LCV, FC0.Latch); |
| 1481 | L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), |
| 1482 | FC0.ExitingBlock); |
| 1483 | |
| 1484 | LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); |
| 1485 | } |
| 1486 | |
| 1487 | // Update the latches |
| 1488 | |
| 1489 | // Replace latch terminator destinations. |
| 1490 | FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); |
| 1491 | FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); |
| 1492 | |
| 1493 | // If FC0.Latch and FC0.ExitingBlock are the same then we have already |
| 1494 | // performed the updates above. |
| 1495 | if (FC0.Latch != FC0.ExitingBlock) |
| 1496 | TreeUpdates.emplace_back(DominatorTree::UpdateType( |
| 1497 | DominatorTree::Insert, FC0.Latch, FC1.Header)); |
| 1498 | |
| 1499 | TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, |
| 1500 | FC0.Latch, FC0.Header)); |
| 1501 | TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, |
| 1502 | FC1.Latch, FC0.Header)); |
| 1503 | TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, |
| 1504 | FC1.Latch, FC1.Header)); |
| 1505 | |
| 1506 | // All done |
| 1507 | // Apply the updates to the Dominator Tree and cleanup. |
| 1508 | |
| 1509 | assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && |
| 1510 | "FC1GuardBlock has successors!!"); |
| 1511 | assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && |
| 1512 | "FC1GuardBlock has predecessors!!"); |
| 1513 | |
| 1514 | // Update DT/PDT |
| 1515 | DTU.applyUpdates(TreeUpdates); |
| 1516 | |
| 1517 | LI.removeBlock(FC1.Preheader); |
| 1518 | DTU.deleteBB(FC1.Preheader); |
| 1519 | DTU.deleteBB(FC0.ExitBlock); |
| 1520 | DTU.flush(); |
| 1521 | |
| 1522 | // Is there a way to keep SE up-to-date so we don't need to forget the loops |
| 1523 | // and rebuild the information in subsequent passes of fusion? |
| 1524 | SE.forgetLoop(FC1.L); |
| 1525 | SE.forgetLoop(FC0.L); |
| 1526 | |
| 1527 | // Merge the loops. |
| 1528 | SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), |
| 1529 | FC1.L->block_end()); |
| 1530 | for (BasicBlock *BB : Blocks) { |
| 1531 | FC0.L->addBlockEntry(BB); |
| 1532 | FC1.L->removeBlockFromLoop(BB); |
| 1533 | if (LI.getLoopFor(BB) != FC1.L) |
| 1534 | continue; |
| 1535 | LI.changeLoopFor(BB, FC0.L); |
| 1536 | } |
| 1537 | while (!FC1.L->empty()) { |
| 1538 | const auto &ChildLoopIt = FC1.L->begin(); |
| 1539 | Loop *ChildLoop = *ChildLoopIt; |
| 1540 | FC1.L->removeChildLoop(ChildLoopIt); |
| 1541 | FC0.L->addChildLoop(ChildLoop); |
| 1542 | } |
| 1543 | |
| 1544 | // Delete the now empty loop L1. |
| 1545 | LI.erase(FC1.L); |
| 1546 | |
| 1547 | #ifndef NDEBUG |
| 1548 | assert(!verifyFunction(*FC0.Header->getParent(), &errs())); |
| 1549 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
| 1550 | assert(PDT.verify()); |
| 1551 | LI.verify(DT); |
| 1552 | SE.verify(); |
| 1553 | #endif |
| 1554 | |
| 1555 | LLVM_DEBUG(dbgs() << "Fusion done:\n"); |
| 1556 | |
| 1557 | return FC0.L; |
| 1558 | } |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1559 | }; |
| 1560 | |
| 1561 | struct LoopFuseLegacy : public FunctionPass { |
| 1562 | |
| 1563 | static char ID; |
| 1564 | |
| 1565 | LoopFuseLegacy() : FunctionPass(ID) { |
| 1566 | initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); |
| 1567 | } |
| 1568 | |
| 1569 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 1570 | AU.addRequiredID(LoopSimplifyID); |
| 1571 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
| 1572 | AU.addRequired<LoopInfoWrapperPass>(); |
| 1573 | AU.addRequired<DominatorTreeWrapperPass>(); |
| 1574 | AU.addRequired<PostDominatorTreeWrapperPass>(); |
| 1575 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
| 1576 | AU.addRequired<DependenceAnalysisWrapperPass>(); |
| 1577 | |
| 1578 | AU.addPreserved<ScalarEvolutionWrapperPass>(); |
| 1579 | AU.addPreserved<LoopInfoWrapperPass>(); |
| 1580 | AU.addPreserved<DominatorTreeWrapperPass>(); |
| 1581 | AU.addPreserved<PostDominatorTreeWrapperPass>(); |
| 1582 | } |
| 1583 | |
| 1584 | bool runOnFunction(Function &F) override { |
| 1585 | if (skipFunction(F)) |
| 1586 | return false; |
| 1587 | auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| 1588 | auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| 1589 | auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); |
| 1590 | auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| 1591 | auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); |
| 1592 | auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
| 1593 | |
| 1594 | const DataLayout &DL = F.getParent()->getDataLayout(); |
| 1595 | LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL); |
| 1596 | return LF.fuseLoops(F); |
| 1597 | } |
| 1598 | }; |
Benjamin Kramer | dc5f805 | 2019-08-23 19:59:23 +0000 | [diff] [blame] | 1599 | } // namespace |
Kit Barton | 3cdf879 | 2019-04-17 18:53:27 +0000 | [diff] [blame] | 1600 | |
| 1601 | PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { |
| 1602 | auto &LI = AM.getResult<LoopAnalysis>(F); |
| 1603 | auto &DT = AM.getResult<DominatorTreeAnalysis>(F); |
| 1604 | auto &DI = AM.getResult<DependenceAnalysis>(F); |
| 1605 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); |
| 1606 | auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); |
| 1607 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); |
| 1608 | |
| 1609 | const DataLayout &DL = F.getParent()->getDataLayout(); |
| 1610 | LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL); |
| 1611 | bool Changed = LF.fuseLoops(F); |
| 1612 | if (!Changed) |
| 1613 | return PreservedAnalyses::all(); |
| 1614 | |
| 1615 | PreservedAnalyses PA; |
| 1616 | PA.preserve<DominatorTreeAnalysis>(); |
| 1617 | PA.preserve<PostDominatorTreeAnalysis>(); |
| 1618 | PA.preserve<ScalarEvolutionAnalysis>(); |
| 1619 | PA.preserve<LoopAnalysis>(); |
| 1620 | return PA; |
| 1621 | } |
| 1622 | |
| 1623 | char LoopFuseLegacy::ID = 0; |
| 1624 | |
| 1625 | INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, |
| 1626 | false) |
| 1627 | INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) |
| 1628 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
| 1629 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
| 1630 | INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) |
| 1631 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
| 1632 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
| 1633 | INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) |
| 1634 | |
| 1635 | FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } |