Blame - llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp - toolchain/llvm-project

blob: 88b1be2e6b82f32885c2fb9af145a2e6308bcb41 [file] [log] [blame]

Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	1	//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
				11	/// This pass does misc. AMDGPU optimizations on IR before instruction
				12	/// selection.
				13	//
				14	//===----------------------------------------------------------------------===//
				15
				16	#include "AMDGPU.h"
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	17	#include "AMDGPUIntrinsicInfo.h"
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	18	#include "AMDGPUSubtarget.h"
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	19	#include "AMDGPUTargetMachine.h"
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	20
				21	#include "llvm/Analysis/DivergenceAnalysis.h"
				22	#include "llvm/CodeGen/Passes.h"
				23	#include "llvm/IR/InstVisitor.h"
				24	#include "llvm/IR/IRBuilder.h"
				25	#include "llvm/Support/Debug.h"
				26	#include "llvm/Support/raw_ostream.h"
				27
				28	#define DEBUG_TYPE "amdgpu-codegenprepare"
				29
				30	using namespace llvm;
				31
				32	namespace {
				33
				34	class AMDGPUCodeGenPrepare : public FunctionPass,
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	35	public InstVisitor<AMDGPUCodeGenPrepare, bool> {
				36	const GCNTargetMachine *TM;
				37	const SISubtarget *ST;
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	38	DivergenceAnalysis *DA;
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	39	Module *Mod;
				40	bool HasUnsafeFPMath;
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	41
Konstantin Zhuravlyov	e14df4b	2016-09-28 20:05:39 +0000	[diff] [blame]	42	/// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
				43	/// binary operator \p V.
				44	///
				45	/// \returns Binary operator \p V.
				46	Value copyFlags(const BinaryOperator &I, Value V) const;
				47
				48	/// \returns Equivalent 16 bit integer type for given 32 bit integer type
				49	/// \p T.
				50	Type getI16Ty(IRBuilder<> &B, const Type T) const;
				51
				52	/// \returns Equivalent 32 bit integer type for given 16 bit integer type
				53	/// \p T.
				54	Type getI32Ty(IRBuilder<> &B, const Type T) const;
				55
				56	/// \returns True if the base element of type \p T is 16 bit integer, false
				57	/// otherwise.
				58	bool isI16Ty(const Type *T) const;
				59
				60	/// \returns True if the base element of type \p T is 32 bit integer, false
				61	/// otherwise.
				62	bool isI32Ty(const Type *T) const;
				63
				64	/// \returns True if binary operation \p I is a signed binary operation, false
				65	/// otherwise.
				66	bool isSigned(const BinaryOperator &I) const;
				67
				68	/// \returns True if the condition of 'select' operation \p I comes from a
				69	/// signed 'icmp' operation, false otherwise.
				70	bool isSigned(const SelectInst &I) const;
				71
				72	/// \brief Promotes uniform 16 bit binary operation \p I to equivalent 32 bit
				73	/// binary operation by sign or zero extending operands to 32 bits, replacing
				74	/// 16 bit operation with equivalent 32 bit operation, and truncating the
				75	/// result of 32 bit operation back to 16 bits. 16 bit division operation is
				76	/// not promoted.
				77	///
				78	/// \returns True if 16 bit binary operation is promoted to equivalent 32 bit
				79	/// binary operation, false otherwise.
				80	bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
				81
				82	/// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
				83	/// operation by sign or zero extending operands to 32 bits, and replacing 16
				84	/// bit operation with 32 bit operation.
				85	///
				86	/// \returns True.
				87	bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
				88
				89	/// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
				90	/// operation by sign or zero extending operands to 32 bits, replacing 16 bit
				91	/// operation with 32 bit operation, and truncating the result of 32 bit
				92	/// operation back to 16 bits.
				93	///
				94	/// \returns True.
				95	bool promoteUniformI16OpToI32Op(SelectInst &I) const;
				96
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	97	public:
				98	static char ID;
				99	AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
				100	FunctionPass(ID),
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	101	TM(static_cast<const GCNTargetMachine *>(TM)),
				102	ST(nullptr),
				103	DA(nullptr),
				104	Mod(nullptr),
				105	HasUnsafeFPMath(false) { }
				106
				107	bool visitFDiv(BinaryOperator &I);
				108
Konstantin Zhuravlyov	e14df4b	2016-09-28 20:05:39 +0000	[diff] [blame]	109	bool visitInstruction(Instruction &I) { return false; }
				110	bool visitBinaryOperator(BinaryOperator &I);
				111	bool visitICmpInst(ICmpInst &I);
				112	bool visitSelectInst(SelectInst &I);
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	113
				114	bool doInitialization(Module &M) override;
				115	bool runOnFunction(Function &F) override;
				116
Mehdi Amini	117296c	2016-10-01 02:56:57 +0000	[diff] [blame]	117	StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	118
				119	void getAnalysisUsage(AnalysisUsage &AU) const override {
				120	AU.addRequired<DivergenceAnalysis>();
				121	AU.setPreservesAll();
				122	}
				123	};
				124
				125	} // End anonymous namespace
				126
Konstantin Zhuravlyov	e14df4b	2016-09-28 20:05:39 +0000	[diff] [blame]	127	Value *AMDGPUCodeGenPrepare::copyFlags(
				128	const BinaryOperator &I, Value *V) const {
				129	assert(isa<BinaryOperator>(V) && "V must be binary operator");
				130
				131	BinaryOperator *BinOp = cast<BinaryOperator>(V);
				132	if (isa<OverflowingBinaryOperator>(BinOp)) {
				133	BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
				134	BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
				135	} else if (isa<PossiblyExactOperator>(BinOp)) {
				136	BinOp->setIsExact(I.isExact());
				137	}
				138
				139	return V;
				140	}
				141
				142	Type AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type T) const {
				143	assert(isI32Ty(T) && "T must be 32 bits");
				144
				145	if (T->isIntegerTy())
				146	return B.getInt16Ty();
				147	return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
				148	}
				149
				150	Type AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type T) const {
				151	assert(isI16Ty(T) && "T must be 16 bits");
				152
				153	if (T->isIntegerTy())
				154	return B.getInt32Ty();
				155	return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
				156	}
				157
				158	bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
				159	if (T->isIntegerTy(16))
				160	return true;
				161	if (!T->isVectorTy())
				162	return false;
				163	return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
				164	}
				165
				166	bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
				167	if (T->isIntegerTy(32))
				168	return true;
				169	if (!T->isVectorTy())
				170	return false;
				171	return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
				172	}
				173
				174	bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
Konstantin Zhuravlyov	691e2e0	2016-10-03 18:29:01 +0000	[diff] [blame^]	175	return I.getOpcode() == Instruction::AShr \|\|
				176	I.getOpcode() == Instruction::SDiv \|\| I.getOpcode() == Instruction::SRem;
Konstantin Zhuravlyov	e14df4b	2016-09-28 20:05:39 +0000	[diff] [blame]	177	}
				178
				179	bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
				180	return isa<ICmpInst>(I.getOperand(0)) ?
				181	cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
				182	}
				183
				184	bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
				185	assert(isI16Ty(I.getType()) && "Op must be 16 bits");
				186
				187	if (I.getOpcode() == Instruction::SDiv \|\| I.getOpcode() == Instruction::UDiv)
				188	return false;
				189
				190	IRBuilder<> Builder(&I);
				191	Builder.SetCurrentDebugLocation(I.getDebugLoc());
				192
				193	Type *I32Ty = getI32Ty(Builder, I.getType());
				194	Value *ExtOp0 = nullptr;
				195	Value *ExtOp1 = nullptr;
				196	Value *ExtRes = nullptr;
				197	Value *TruncRes = nullptr;
				198
				199	if (isSigned(I)) {
				200	ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
				201	ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
				202	} else {
				203	ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
				204	ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
				205	}
				206	ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
				207	TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));
				208
				209	I.replaceAllUsesWith(TruncRes);
				210	I.eraseFromParent();
				211
				212	return true;
				213	}
				214
				215	bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
				216	assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
				217	assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");
				218
				219	IRBuilder<> Builder(&I);
				220	Builder.SetCurrentDebugLocation(I.getDebugLoc());
				221
				222	Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
				223	Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
				224	Value *ExtOp0 = nullptr;
				225	Value *ExtOp1 = nullptr;
				226	Value *NewICmp = nullptr;
				227
				228	if (I.isSigned()) {
				229	ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
				230	ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
				231	} else {
				232	ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
				233	ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
				234	}
				235	NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
				236
				237	I.replaceAllUsesWith(NewICmp);
				238	I.eraseFromParent();
				239
				240	return true;
				241	}
				242
				243	bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
				244	assert(isI16Ty(I.getType()) && "Op must be 16 bits");
				245
				246	IRBuilder<> Builder(&I);
				247	Builder.SetCurrentDebugLocation(I.getDebugLoc());
				248
				249	Type *I32Ty = getI32Ty(Builder, I.getType());
				250	Value *ExtOp1 = nullptr;
				251	Value *ExtOp2 = nullptr;
				252	Value *ExtRes = nullptr;
				253	Value *TruncRes = nullptr;
				254
				255	if (isSigned(I)) {
				256	ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
				257	ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
				258	} else {
				259	ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
				260	ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
				261	}
				262	ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
				263	TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));
				264
				265	I.replaceAllUsesWith(TruncRes);
				266	I.eraseFromParent();
				267
				268	return true;
				269	}
				270
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	271	static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
				272	const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
				273	if (!CNum)
				274	return false;
				275
				276	// Reciprocal f32 is handled separately without denormals.
Matt Arsenault	e3862cd	2016-07-26 23:25:44 +0000	[diff] [blame]	277	return UnsafeDiv \|\| CNum->isExactlyValue(+1.0);
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	278	}
				279
				280	// Insert an intrinsic for fast fdiv for safe math situations where we can
				281	// reduce precision. Leave fdiv for situations where the generic node is
				282	// expected to be optimized.
				283	bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
				284	Type *Ty = FDiv.getType();
				285
				286	// TODO: Handle half
				287	if (!Ty->getScalarType()->isFloatTy())
				288	return false;
				289
				290	MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
				291	if (!FPMath)
				292	return false;
				293
				294	const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
				295	float ULP = FPOp->getFPAccuracy();
				296	if (ULP < 2.5f)
				297	return false;
				298
				299	FastMathFlags FMF = FPOp->getFastMathFlags();
				300	bool UnsafeDiv = HasUnsafeFPMath \|\| FMF.unsafeAlgebra() \|\|
				301	FMF.allowReciprocal();
				302	if (ST->hasFP32Denormals() && !UnsafeDiv)
				303	return false;
				304
				305	IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
				306	Builder.setFastMathFlags(FMF);
				307	Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
				308
				309	const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
				310	Function *Decl
				311	= II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
				312
				313	Value *Num = FDiv.getOperand(0);
				314	Value *Den = FDiv.getOperand(1);
				315
				316	Value *NewFDiv = nullptr;
				317
				318	if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
				319	NewFDiv = UndefValue::get(VT);
				320
				321	// FIXME: Doesn't do the right thing for cases where the vector is partially
				322	// constant. This works when the scalarizer pass is run first.
				323	for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
				324	Value *NumEltI = Builder.CreateExtractElement(Num, I);
				325	Value *DenEltI = Builder.CreateExtractElement(Den, I);
				326	Value *NewElt;
				327
				328	if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
				329	NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
				330	} else {
				331	NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
				332	}
				333
				334	NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
				335	}
				336	} else {
				337	if (!shouldKeepFDivF32(Num, UnsafeDiv))
				338	NewFDiv = Builder.CreateCall(Decl, { Num, Den });
				339	}
				340
				341	if (NewFDiv) {
				342	FDiv.replaceAllUsesWith(NewFDiv);
				343	NewFDiv->takeName(&FDiv);
				344	FDiv.eraseFromParent();
				345	}
				346
				347	return true;
				348	}
				349
				350	static bool hasUnsafeFPMath(const Function &F) {
				351	Attribute Attr = F.getFnAttribute("unsafe-fp-math");
				352	return Attr.getValueAsString() == "true";
				353	}
				354
Konstantin Zhuravlyov	e14df4b	2016-09-28 20:05:39 +0000	[diff] [blame]	355	bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
				356	bool Changed = false;
				357
				358	// TODO: Should we promote smaller types that will be legalized to i16?
				359	if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
				360	Changed \|= promoteUniformI16OpToI32Op(I);
				361
				362	return Changed;
				363	}
				364
				365	bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
				366	bool Changed = false;
				367
				368	// TODO: Should we promote smaller types that will be legalized to i16?
				369	if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
				370	isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
				371	Changed \|= promoteUniformI16OpToI32Op(I);
				372
				373	return Changed;
				374	}
				375
				376	bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
				377	bool Changed = false;
				378
				379	// TODO: Should we promote smaller types that will be legalized to i16?
				380	if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
				381	Changed \|= promoteUniformI16OpToI32Op(I);
				382
				383	return Changed;
				384	}
				385
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	386	bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	387	Mod = &M;
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	388	return false;
				389	}
				390
				391	bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
				392	if (!TM \|\| skipFunction(F))
				393	return false;
				394
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	395	ST = &TM->getSubtarget<SISubtarget>(F);
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	396	DA = &getAnalysis<DivergenceAnalysis>();
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	397	HasUnsafeFPMath = hasUnsafeFPMath(F);
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	398
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	399	bool MadeChange = false;
				400
				401	for (BasicBlock &BB : F) {
				402	BasicBlock::iterator Next;
				403	for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
				404	Next = std::next(I);
				405	MadeChange \|= visit(*I);
				406	}
				407	}
				408
				409	return MadeChange;
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	410	}
				411
				412	INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
				413	"AMDGPU IR optimizations", false, false)
				414	INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
				415	INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
				416	"AMDGPU IR optimizations", false, false)
				417
				418	char AMDGPUCodeGenPrepare::ID = 0;
				419
Matt Arsenault	a1fe17c	2016-07-19 23:16:53 +0000	[diff] [blame]	420	FunctionPass llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine TM) {
Matt Arsenault	86de486	2016-06-24 07:07:55 +0000	[diff] [blame]	421	return new AMDGPUCodeGenPrepare(TM);
				422	}