//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the AMDGPU-specific replacement of the standard inliner.
/// Its main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In this situation, we are
/// unable to eliminate private memory in the caller unless the callee is
/// inlined, and we end up with slow and expensive scratch accesses. Thus, we
/// boost the inline threshold for such functions here.
///
//===----------------------------------------------------------------------===//
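
// A motivating sketch (hypothetical kernel code, names invented purely for
// illustration):
//
//   void blur(float *Tmp);   // Callee taking a pointer to private memory.
//   kernel void run() {
//     float Tmp[16];         // Private alloca in the caller.
//     blur(Tmp);             // Keeps the alloca alive unless inlined.
//   }
//
// The array Tmp can only be promoted out of scratch once blur() is inlined,
// which is why such call sites get a boosted threshold below.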

#include "AMDGPU.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Inliner.h"

using namespace llvm;

#define DEBUG_TYPE "inline"

static cl::opt<int>
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
              cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for
// that heuristic.
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
                cl::desc("Maximum alloca size to use for inline cost"));
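
// Both knobs are hidden command-line options, so they can be tuned without
// rebuilding; a hypothetical invocation (shown only as an illustration):
//
//   opt -amdgpu-inline -amdgpu-inline-arg-alloca-cost=4400 \
//       -amdgpu-inline-arg-alloca-cutoff=128 -S input.ll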

namespace {

class AMDGPUInliner : public LegacyInlinerBase {

public:
  AMDGPUInliner() : LegacyInlinerBase(ID) {
    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
    Params = getInlineParams();
  }

  static char ID; // Pass identification, replacement for typeid

  unsigned getInlineThreshold(CallSite CS) const;

  InlineCost getInlineCost(CallSite CS) override;

  bool runOnSCC(CallGraphSCC &SCC) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
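  // Cached by runOnSCC() before any inline cost queries are made.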
  TargetTransformInfoWrapperPass *TTIWP;

  InlineParams Params;
};

} // end anonymous namespace

char AMDGPUInliner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
                      "AMDGPU Function Integration/Inlining", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
                    "AMDGPU Function Integration/Inlining", false, false)

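// Factory for the target to instantiate this pass; presumably this is what
// lets it stand in for the generic inliner in the AMDGPU pass pipeline.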
Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }

bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  return LegacyInlinerBase::runOnSCC(SCC);
}

void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetTransformInfoWrapperPass>();
  LegacyInlinerBase::getAnalysisUsage(AU);
}

unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
  int Thres = Params.DefaultThreshold;

  Function *Caller = CS.getCaller();
  // Listen to the inlinehint attribute when it would increase the threshold
  // and the caller does not need to minimize its size.
  Function *Callee = CS.getCalledFunction();
  bool InlineHint = Callee && !Callee->isDeclaration() &&
                    Callee->hasFnAttribute(Attribute::InlineHint);
  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres &&
      !Caller->hasFnAttribute(Attribute::MinSize))
    Thres = Params.HintThreshold.getValue();

  const DataLayout &DL = Caller->getParent()->getDataLayout();
  if (!Callee)
    return (unsigned)Thres;

  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent());

  // If we have a pointer to a private array passed into a function
  // it will not be optimized out, leaving scratch usage.
  // Increase the inline threshold to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CS.args()) {
    Type *Ty = PtrArg->getType();
    if (!Ty->isPointerTy() ||
        Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
      continue;
    PtrArg = GetUnderlyingObject(PtrArg, DL);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
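  // A sketch of the resulting arithmetic, assuming stock InlineParams (a
  // default threshold of 225) and the option defaults above: a call site
  // passing a live private array of at most 256 bytes is judged against
  // 225 + 2200 = 2425 rather than 225, strongly favoring inlining.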
  if (AllocaSize)
    Thres += ArgAllocaCost;

  return (unsigned)Thres;
}

// Check if the call is just a wrapper around another call.
// In this case we only have a call and a ret instruction.
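// As an illustration (hypothetical IR, not from an actual test), a callee
// such as
//
//   define float @wrap(float %x) {
//     %r = call float @impl(float %x)
//     ret float %r
//   }
//
// is detected as a wrapper and given InlineCost::getAlways() below.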
static bool isWrapperOnlyCall(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  if (!Callee || Callee->size() != 1)
    return false;
  const BasicBlock &BB = Callee->getEntryBlock();
  if (const Instruction *I = BB.getFirstNonPHI()) {
    if (!isa<CallInst>(I))
      return false;
    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
      LLVM_DEBUG(dbgs() << "    Wrapper only call detected: "
                        << Callee->getName() << '\n');
      return true;
    }
  }
  return false;
}


InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  Function *Caller = CS.getCaller();

  // The callee may be null for indirect call sites; check that before it is
  // dereferenced for the TTI lookup below.
  if (!Callee || Callee->isDeclaration() || CS.isNoInline())
    return llvm::InlineCost::getNever();

  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
  if (!TTI.areInlineCompatible(Caller, Callee))
    return llvm::InlineCost::getNever();

  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  if (isWrapperOnlyCall(CS))
    return llvm::InlineCost::getAlways();

  InlineParams LocalParams = Params;
  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
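  // Probe whether optimization remarks are enabled for the caller: the
  // temporary remark against the entry block exists only so that
  // isEnabled() can be queried without a real debug location.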
  bool RemarksEnabled = false;
  const auto &BBs = Caller->getBasicBlockList();
  if (!BBs.empty()) {
    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
    if (DI.isEnabled())
      RemarksEnabled = true;
  }

  OptimizationRemarkEmitter ORE(Caller);
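  // Adapt the legacy pass's AssumptionCacheTracker to the callback shape
  // that llvm::getInlineCost() expects.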
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [this](Function &F) -> AssumptionCache & {
    return ACT->getAssumptionCache(F);
  };

  return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
                             None, PSI, RemarksEnabled ? &ORE : nullptr);
}