//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the AMDGPU-specific replacement of the standard inliner.
/// Its main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In this situation, we are
/// unable to eliminate private memory in the caller unless the callee is
/// inlined, and we end up with slow and expensive scratch accesses. Thus, we
/// boost the inline threshold for such functions here.
///
//===----------------------------------------------------------------------===//
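
// A motivating sketch (hypothetical kernel code, names invented purely for
// illustration):
//
//   void blur(float *Tmp);   // Callee taking a pointer to private memory.
//   kernel void run() {
//     float Tmp[16];         // Private alloca in the caller.
//     blur(Tmp);             // Keeps the alloca alive unless inlined.
//   }
//
// The array Tmp can only be promoted out of scratch once blur() is inlined,
// which is why such call sites get a boosted threshold below.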

#include "AMDGPU.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Inliner.h"

using namespace llvm;

#define DEBUG_TYPE "inline"

static cl::opt<int>
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
              cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for
// that heuristic.
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
                cl::desc("Maximum alloca size to use for inline cost"));
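
// Both knobs are hidden command-line options, so they can be tuned without
// rebuilding; a hypothetical invocation (shown only as an illustration):
//
//   opt -amdgpu-inline -amdgpu-inline-arg-alloca-cost=4400 \
//       -amdgpu-inline-arg-alloca-cutoff=128 -S input.ll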

namespace {

class AMDGPUInliner : public LegacyInlinerBase {

public:
  AMDGPUInliner() : LegacyInlinerBase(ID) {
    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
    Params = getInlineParams();
  }

  static char ID; // Pass identification, replacement for typeid

  unsigned getInlineThreshold(CallSite CS) const;

  InlineCost getInlineCost(CallSite CS) override;

  bool runOnSCC(CallGraphSCC &SCC) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
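  // Cached by runOnSCC() before any inline cost queries are made.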
  TargetTransformInfoWrapperPass *TTIWP;

  InlineParams Params;
};

} // end anonymous namespace

char AMDGPUInliner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
                      "AMDGPU Function Integration/Inlining", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
                    "AMDGPU Function Integration/Inlining", false, false)

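// Factory for the target to instantiate this pass; presumably this is what
// lets it stand in for the generic inliner in the AMDGPU pass pipeline.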
Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }

bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  return LegacyInlinerBase::runOnSCC(SCC);
}

void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetTransformInfoWrapperPass>();
  LegacyInlinerBase::getAnalysisUsage(AU);
}

unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
  int Thres = Params.DefaultThreshold;

  Function *Caller = CS.getCaller();
  // Listen to the inlinehint attribute when it would increase the threshold
  // and the caller does not need to minimize its size.
  Function *Callee = CS.getCalledFunction();
  bool InlineHint = Callee && !Callee->isDeclaration() &&
                    Callee->hasFnAttribute(Attribute::InlineHint);
  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres &&
      !Caller->hasFnAttribute(Attribute::MinSize))
    Thres = Params.HintThreshold.getValue();

  const DataLayout &DL = Caller->getParent()->getDataLayout();
  if (!Callee)
    return (unsigned)Thres;

  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());

  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CS.args()) {
    Type *Ty = PtrArg->getType();
    if (!Ty->isPointerTy() ||
        Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
      continue;
    PtrArg = GetUnderlyingObject(PtrArg, DL);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
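  // A sketch of the resulting arithmetic, assuming stock InlineParams (a
  // default threshold of 225) and the option defaults above: a call site
  // passing a live private array of at most 256 bytes is judged against
  // 225 + 2200 = 2425 rather than 225, strongly favoring inlining.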
  if (AllocaSize)
    Thres += ArgAllocaCost;

  return (unsigned)Thres;
}

// Check if the call is just a wrapper around another call.
// In this case we only have a call and a ret instruction.
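// As an illustration (hypothetical IR, not from an actual test), a callee
// such as
//
//   define float @wrap(float %x) {
//     %r = call float @impl(float %x)
//     ret float %r
//   }
//
// is detected as a wrapper and given InlineCost::getAlways() below.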
static bool isWrapperOnlyCall(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  if (!Callee || Callee->size() != 1)
    return false;
  const BasicBlock &BB = Callee->getEntryBlock();
  if (const Instruction *I = BB.getFirstNonPHI()) {
    if (!isa<CallInst>(I))
      return false;
    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
      LLVM_DEBUG(dbgs() << "    Wrapper only call detected: "
                        << Callee->getName() << '\n');
      return true;
    }
  }
  return false;
}


InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  Function *Caller = CS.getCaller();

  // The callee may be null for indirect call sites; check that before it is
  // dereferenced for the TTI lookup below.
  if (!Callee || Callee->isDeclaration() || CS.isNoInline())
    return llvm::InlineCost::getNever();

  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
  if (!TTI.areInlineCompatible(Caller, Callee))
    return llvm::InlineCost::getNever();

  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  if (isWrapperOnlyCall(CS))
    return llvm::InlineCost::getAlways();

  InlineParams LocalParams = Params;
  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
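  // Probe whether optimization remarks are enabled for the caller: the
  // temporary remark against the entry block exists only so that
  // isEnabled() can be queried without a real debug location.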
  bool RemarksEnabled = false;
  const auto &BBs = Caller->getBasicBlockList();
  if (!BBs.empty()) {
    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
    if (DI.isEnabled())
      RemarksEnabled = true;
  }

  OptimizationRemarkEmitter ORE(Caller);
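  // Adapt the legacy pass's AssumptionCacheTracker to the callback shape
  // that llvm::getInlineCost() expects.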
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [this](Function &F) -> AssumptionCache & {
    return ACT->getAssumptionCache(F);
  };

  return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
                             None, PSI, RemarksEnabled ? &ORE : nullptr);
}