//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
  /// binary operator \p V.
  ///
  /// \returns Binary operator \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns The equivalent 16 bit integer type for the given 32 bit integer
  /// type \p T.
  Type *getI16Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns The equivalent 32 bit integer type for the given 16 bit integer
  /// type \p T.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if the base element of type \p T is a 16 bit integer, false
  /// otherwise.
  bool isI16Ty(const Type *T) const;

  /// \returns True if the base element of type \p T is a 32 bit integer, false
  /// otherwise.
  bool isI32Ty(const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \brief Promotes uniform 16 bit binary operation \p I to an equivalent 32
  /// bit binary operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the equivalent 32 bit operation, and
  /// truncating the result of the 32 bit operation back to 16 bits. 16 bit
  /// division operations are not promoted.
  ///
  /// \returns True if the 16 bit binary operation is promoted to the
  /// equivalent 32 bit binary operation, false otherwise.
  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;

  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation by sign or zero extending operands to 32 bits, and replacing
  /// the 16 bit operation with the 32 bit operation.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;

  /// \brief Promotes uniform 16 bit 'select' operation \p I to a 32 bit
  /// 'select' operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the 32 bit operation, and truncating
  /// the result of the 32 bit operation back to 16 bits.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(SelectInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU IR optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  assert(isa<BinaryOperator>(V) && "V must be binary operator");

  BinaryOperator *BinOp = cast<BinaryOperator>(V);
  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp)) {
    BinOp->setIsExact(I.isExact());
  }

  return V;
}

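// Illustrative type mappings for the helpers below (the examples are
// assumptions, not taken from the original source): getI16Ty maps
// i32 -> i16 and <4 x i32> -> <4 x i16>; getI32Ty is the inverse, mapping
// i16 -> i32 and <4 x i16> -> <4 x i32>.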
Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI32Ty(T) && "T must be 32 bits");

  if (T->isIntegerTy())
    return B.getInt16Ty();
  return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI16Ty(T) && "T must be 16 bits");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
  if (T->isIntegerTy(16))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
}

bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
  if (T->isIntegerTy(32))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

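// A sketch of the promotion performed below, using illustrative IR that is
// not taken from the original source: a uniform 16 bit add such as
//   %r = add nuw i16 %a, %b
// is rewritten (with zero extension, since add is not a signed operation
// here) to
//   %a.ext = zext i16 %a to i32
//   %b.ext = zext i16 %b to i32
//   %r.ext = add nuw i32 %a.ext, %b.ext
//   %r.new = trunc i32 %r.ext to i16
// with the nuw flag carried over by copyFlags.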
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

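// A sketch of the 'icmp' promotion below (illustrative IR, not from the
// original source): a uniform
//   %c = icmp slt i16 %a, %b
// becomes (sign extended, because the predicate is signed)
//   %a.ext = sext i16 %a to i32
//   %b.ext = sext i16 %b to i32
//   %c.new = icmp slt i32 %a.ext, %b.ext
// The i1 result requires no truncation.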
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
  assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
  assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
  Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

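// A sketch of the 'select' promotion below (illustrative IR, not from the
// original source): a uniform
//   %r = select i1 %c, i16 %x, i16 %y
// becomes (zero extended here, since %c is not the result of a signed 'icmp')
//   %x.ext = zext i16 %x to i32
//   %y.ext = zext i16 %y to i32
//   %r.ext = select i1 %c, i32 %x.ext, i32 %y.ext
//   %r.new = trunc i32 %r.ext to i16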
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
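//
// An illustrative sketch (example IR, not taken from the original source): a
// divide whose !fpmath metadata permits 2.5 ULP of error, such as
//   %d = fdiv float %x, %y, !fpmath !0
//   !0 = !{float 2.500000e+00}
// is rewritten into a call to the fast fdiv intrinsic, provided f32 denormals
// are disabled or unsafe math is enabled.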
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

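// The attribute queried below is the string function attribute as it appears
// in IR, e.g. (illustrative):
//   attributes #0 = { "unsafe-fp-math"="true" }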
static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
      isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}