Blame - llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp - toolchain/llvm-project

blob: ce17f027b522838e9720a41aeae2c0ba72cf765a [file] [log] [blame]

Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	1	//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
Tom Stellard	8b1e021	2013-07-27 00:01:07 +0000	[diff] [blame]	2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// \file
				11	// This file implements a TargetTransformInfo analysis pass specific to the
				12	// AMDGPU target machine. It uses the target's detailed information to provide
				13	// more precise answers to certain TTI queries, while letting the target
				14	// independent and default TTI implementations handle the rest.
				15	//
				16	//===----------------------------------------------------------------------===//
				17
Chandler Carruth	93dcdc4	2015-01-31 11:17:59 +0000	[diff] [blame]	18	#include "AMDGPUTargetTransformInfo.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	19	#include "AMDGPUSubtarget.h"
Alexander Timofeev	2e5eece	2018-03-05 15:12:21 +0000	[diff] [blame]	20	#include "Utils/AMDGPUBaseInfo.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	21	#include "llvm/ADT/STLExtras.h"
Tom Stellard	8cce9bd	2014-01-23 18:49:28 +0000	[diff] [blame]	22	#include "llvm/Analysis/LoopInfo.h"
Tom Stellard	8b1e021	2013-07-27 00:01:07 +0000	[diff] [blame]	23	#include "llvm/Analysis/TargetTransformInfo.h"
Tom Stellard	8cce9bd	2014-01-23 18:49:28 +0000	[diff] [blame]	24	#include "llvm/Analysis/ValueTracking.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	25	#include "llvm/CodeGen/ISDOpcodes.h"
Craig Topper	2fa1436	2018-03-29 17:21:10 +0000	[diff] [blame]	26	#include "llvm/CodeGen/ValueTypes.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	27	#include "llvm/IR/Argument.h"
				28	#include "llvm/IR/Attributes.h"
				29	#include "llvm/IR/BasicBlock.h"
				30	#include "llvm/IR/CallingConv.h"
				31	#include "llvm/IR/DataLayout.h"
				32	#include "llvm/IR/DerivedTypes.h"
				33	#include "llvm/IR/Function.h"
				34	#include "llvm/IR/Instruction.h"
				35	#include "llvm/IR/Instructions.h"
				36	#include "llvm/IR/IntrinsicInst.h"
Chandler Carruth	6bda14b	2017-06-06 11:49:48 +0000	[diff] [blame]	37	#include "llvm/IR/Module.h"
Matt Arsenault	376f1bd	2017-08-31 05:47:00 +0000	[diff] [blame]	38	#include "llvm/IR/PatternMatch.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	39	#include "llvm/IR/Type.h"
				40	#include "llvm/IR/Value.h"
				41	#include "llvm/MC/SubtargetFeature.h"
				42	#include "llvm/Support/Casting.h"
				43	#include "llvm/Support/CommandLine.h"
Tom Stellard	8b1e021	2013-07-27 00:01:07 +0000	[diff] [blame]	44	#include "llvm/Support/Debug.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	45	#include "llvm/Support/ErrorHandling.h"
David Blaikie	13e77db	2018-03-23 23:58:25 +0000	[diff] [blame]	46	#include "llvm/Support/MachineValueType.h"
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	47	#include "llvm/Support/raw_ostream.h"
				48	#include "llvm/Target/TargetMachine.h"
				49	#include <algorithm>
				50	#include <cassert>
				51	#include <limits>
				52	#include <utility>
				53
Tom Stellard	8b1e021	2013-07-27 00:01:07 +0000	[diff] [blame]	54	using namespace llvm;
				55
Chandler Carruth	84e68b2	2014-04-22 02:41:26 +0000	[diff] [blame]	56	#define DEBUG_TYPE "AMDGPUtti"
				57
Stanislav Mekhanoshin	f29602d	2017-02-03 02:20:05 +0000	[diff] [blame]	58	static cl::opt<unsigned> UnrollThresholdPrivate(
				59	"amdgpu-unroll-threshold-private",
				60	cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	61	cl::init(2500), cl::Hidden);
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	62
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	63	static cl::opt<unsigned> UnrollThresholdLocal(
				64	"amdgpu-unroll-threshold-local",
				65	cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
				66	cl::init(1000), cl::Hidden);
				67
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	68	static cl::opt<unsigned> UnrollThresholdIf(
				69	"amdgpu-unroll-threshold-if",
				70	cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
				71	cl::init(150), cl::Hidden);
				72
				73	static bool dependsOnLocalPhi(const Loop L, const Value Cond,
				74	unsigned Depth = 0) {
				75	const Instruction *I = dyn_cast<Instruction>(Cond);
				76	if (!I)
				77	return false;
				78
				79	for (const Value *V : I->operand_values()) {
				80	if (!L->contains(I))
				81	continue;
				82	if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	83	if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	84	return SubLoop->contains(PHI); }))
				85	return true;
				86	} else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
				87	return true;
				88	}
				89	return false;
				90	}
				91
Geoff Berry	66d9bdb	2017-06-28 15:53:17 +0000	[diff] [blame]	92	void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
Chandler Carruth	705b185	2015-01-31 03:43:40 +0000	[diff] [blame]	93	TTI::UnrollingPreferences &UP) {
Matt Arsenault	c824458	2014-07-25 23:02:42 +0000	[diff] [blame]	94	UP.Threshold = 300; // Twice the default.
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	95	UP.MaxCount = std::numeric_limits<unsigned>::max();
Matt Arsenault	c824458	2014-07-25 23:02:42 +0000	[diff] [blame]	96	UP.Partial = true;
				97
				98	// TODO: Do we want runtime unrolling?
				99
Stanislav Mekhanoshin	f29602d	2017-02-03 02:20:05 +0000	[diff] [blame]	100	// Maximum alloca size than can fit registers. Reserve 16 registers.
				101	const unsigned MaxAlloca = (256 - 16) * 4;
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	102	unsigned ThresholdPrivate = UnrollThresholdPrivate;
				103	unsigned ThresholdLocal = UnrollThresholdLocal;
				104	unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
				105	AMDGPUAS ASST = ST->getAMDGPUAS();
Matt Arsenault	ac6e39c	2014-07-17 06:19:06 +0000	[diff] [blame]	106	for (const BasicBlock *BB : L->getBlocks()) {
Mehdi Amini	a28d91d	2015-03-10 02:37:25 +0000	[diff] [blame]	107	const DataLayout &DL = BB->getModule()->getDataLayout();
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	108	unsigned LocalGEPsSeen = 0;
				109
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	110	if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	111	return SubLoop->contains(BB); }))
				112	continue; // Block belongs to an inner loop.
				113
Matt Arsenault	ac6e39c	2014-07-17 06:19:06 +0000	[diff] [blame]	114	for (const Instruction &I : *BB) {
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	115	// Unroll a loop which contains an "if" statement whose condition
				116	// defined by a PHI belonging to the loop. This may help to eliminate
				117	// if region and potentially even PHI itself, saving on both divergence
				118	// and registers used for the PHI.
				119	// Add a small bonus for each of such "if" statements.
				120	if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
				121	if (UP.Threshold < MaxBoost && Br->isConditional()) {
				122	if (L->isLoopExiting(Br->getSuccessor(0)) \|\|
				123	L->isLoopExiting(Br->getSuccessor(1)))
				124	continue;
				125	if (dependsOnLocalPhi(L, Br->getCondition())) {
				126	UP.Threshold += UnrollThresholdIf;
Nicola Zaghen	d34e60c	2018-05-14 12:53:11 +0000	[diff] [blame]	127	LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
				128	<< " for loop:\n"
				129	<< L << " due to " << Br << '\n');
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	130	if (UP.Threshold >= MaxBoost)
				131	return;
				132	}
				133	}
				134	continue;
				135	}
				136
Matt Arsenault	ac6e39c	2014-07-17 06:19:06 +0000	[diff] [blame]	137	const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	138	if (!GEP)
Tom Stellard	8cce9bd	2014-01-23 18:49:28 +0000	[diff] [blame]	139	continue;
Matt Arsenault	ac6e39c	2014-07-17 06:19:06 +0000	[diff] [blame]	140
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	141	unsigned AS = GEP->getAddressSpace();
				142	unsigned Threshold = 0;
				143	if (AS == ASST.PRIVATE_ADDRESS)
				144	Threshold = ThresholdPrivate;
				145	else if (AS == ASST.LOCAL_ADDRESS)
				146	Threshold = ThresholdLocal;
				147	else
				148	continue;
				149
				150	if (UP.Threshold >= Threshold)
				151	continue;
				152
				153	if (AS == ASST.PRIVATE_ADDRESS) {
				154	const Value *Ptr = GEP->getPointerOperand();
				155	const AllocaInst *Alloca =
				156	dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
				157	if (!Alloca \|\| !Alloca->isStaticAlloca())
				158	continue;
Stanislav Mekhanoshin	f29602d	2017-02-03 02:20:05 +0000	[diff] [blame]	159	Type *Ty = Alloca->getAllocatedType();
				160	unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
				161	if (AllocaSize > MaxAlloca)
				162	continue;
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	163	} else if (AS == ASST.LOCAL_ADDRESS) {
				164	LocalGEPsSeen++;
				165	// Inhibit unroll for local memory if we have seen addressing not to
				166	// a variable, most likely we will be unable to combine it.
				167	// Do not unroll too deep inner loops for local memory to give a chance
				168	// to unroll an outer loop for a more important reason.
				169	if (LocalGEPsSeen > 1 \|\| L->getLoopDepth() > 2 \|\|
				170	(!isa<GlobalVariable>(GEP->getPointerOperand()) &&
				171	!isa<Argument>(GEP->getPointerOperand())))
				172	continue;
				173	}
Stanislav Mekhanoshin	f29602d	2017-02-03 02:20:05 +0000	[diff] [blame]	174
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	175	// Check if GEP depends on a value defined by this loop itself.
				176	bool HasLoopDef = false;
				177	for (const Value *Op : GEP->operands()) {
				178	const Instruction *Inst = dyn_cast<Instruction>(Op);
				179	if (!Inst \|\| L->isLoopInvariant(Op))
Stanislav Mekhanoshin	f29602d	2017-02-03 02:20:05 +0000	[diff] [blame]	180	continue;
				181
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	182	if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	183	return SubLoop->contains(Inst); }))
				184	continue;
				185	HasLoopDef = true;
				186	break;
Tom Stellard	8cce9bd	2014-01-23 18:49:28 +0000	[diff] [blame]	187	}
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	188	if (!HasLoopDef)
				189	continue;
				190
				191	// We want to do whatever we can to limit the number of alloca
				192	// instructions that make it through to the code generator. allocas
				193	// require us to use indirect addressing, which is slow and prone to
				194	// compiler bugs. If this loop does an address calculation on an
				195	// alloca ptr, then we want to use a higher than normal loop unroll
				196	// threshold. This will give SROA a better chance to eliminate these
				197	// allocas.
				198	//
				199	// We also want to have more unrolling for local memory to let ds
				200	// instructions with different offsets combine.
				201	//
				202	// Don't use the maximum allowed value here as it will make some
				203	// programs way too big.
				204	UP.Threshold = Threshold;
Nicola Zaghen	d34e60c	2018-05-14 12:53:11 +0000	[diff] [blame]	205	LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
				206	<< " for loop:\n"
				207	<< L << " due to " << GEP << '\n');
Stanislav Mekhanoshin	478b819	2017-04-07 16:26:28 +0000	[diff] [blame]	208	if (UP.Threshold >= MaxBoost)
Stanislav Mekhanoshin	baf31ac	2017-03-28 22:13:51 +0000	[diff] [blame]	209	return;
Tom Stellard	8cce9bd	2014-01-23 18:49:28 +0000	[diff] [blame]	210	}
				211	}
				212	}
Matt Arsenault	3dd43fc	2014-07-18 06:07:13 +0000	[diff] [blame]	213
Matt Arsenault	67cd347	2017-06-20 20:38:06 +0000	[diff] [blame]	214	unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
				215	// The concept of vector registers doesn't really exist. Some packed vector
				216	// operations operate on the normal 32-bit registers.
Matt Arsenault	a93441f	2014-07-19 18:15:16 +0000	[diff] [blame]	217
				218	// Number of VGPRs on SI.
				219	if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
				220	return 256;
				221
				222	return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
				223	}
				224
Matt Arsenault	67cd347	2017-06-20 20:38:06 +0000	[diff] [blame]	225	unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
				226	// This is really the number of registers to fill when vectorizing /
				227	// interleaving loops, so we lie to avoid trying to use all registers.
				228	return getHardwareNumberOfRegisters(Vec) >> 3;
				229	}
				230
Daniel Neilson	c0112ae	2017-06-12 14:22:21 +0000	[diff] [blame]	231	unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
Matt Arsenault	67cd347	2017-06-20 20:38:06 +0000	[diff] [blame]	232	return 32;
				233	}
				234
				235	unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
				236	return 32;
Matt Arsenault	4339b3f	2015-12-24 05:14:55 +0000	[diff] [blame]	237	}
Matt Arsenault	a93441f	2014-07-19 18:15:16 +0000	[diff] [blame]	238
Farhana Aleen	8919664	2018-03-07 17:09:18 +0000	[diff] [blame]	239	unsigned AMDGPUTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
				240	unsigned ChainSizeInBytes,
				241	VectorType *VecTy) const {
				242	unsigned VecRegBitWidth = VF * LoadSize;
				243	if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
				244	// TODO: Support element-size less than 32bit?
				245	return 128 / LoadSize;
				246
				247	return VF;
				248	}
				249
				250	unsigned AMDGPUTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
				251	unsigned ChainSizeInBytes,
				252	VectorType *VecTy) const {
				253	unsigned VecRegBitWidth = VF * StoreSize;
				254	if (VecRegBitWidth > 128)
				255	return 128 / StoreSize;
				256
				257	return VF;
				258	}
				259
Volkan Keles	1c38681	2016-10-03 10:31:34 +0000	[diff] [blame]	260	unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
Yaxun Liu	1a14bfa	2017-03-27 14:04:01 +0000	[diff] [blame]	261	AMDGPUAS AS = ST->getAMDGPUAS();
				262	if (AddrSpace == AS.GLOBAL_ADDRESS \|\|
				263	AddrSpace == AS.CONSTANT_ADDRESS \|\|
Farhana Aleen	8919664	2018-03-07 17:09:18 +0000	[diff] [blame]	264	AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
				265	if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
				266	return 128;
				267	return 512;
				268	}
				269
Marek Olsak	a9a58fa	2018-04-10 22:48:23 +0000	[diff] [blame]	270	if (AddrSpace == AS.FLAT_ADDRESS)
Alex Shlyapnikov	79f2c72	2018-04-09 19:47:38 +0000	[diff] [blame]	271	return 128;
Marek Olsak	52b033b	2018-04-09 16:56:32 +0000	[diff] [blame]	272
Marek Olsak	a9a58fa	2018-04-10 22:48:23 +0000	[diff] [blame]	273	if (AddrSpace == AS.LOCAL_ADDRESS \|\|
				274	AddrSpace == AS.REGION_ADDRESS)
				275	return ST->useDS128() ? 128 : 64;
				276
Yaxun Liu	1a14bfa	2017-03-27 14:04:01 +0000	[diff] [blame]	277	if (AddrSpace == AS.PRIVATE_ADDRESS)
Matt Arsenault	0994bd5	2016-07-01 00:56:27 +0000	[diff] [blame]	278	return 8 * ST->getMaxPrivateElementSize();
Yaxun Liu	1a14bfa	2017-03-27 14:04:01 +0000	[diff] [blame]	279
				280	if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
				281	(AddrSpace == AS.PARAM_D_ADDRESS \|\|
				282	AddrSpace == AS.PARAM_I_ADDRESS \|\|
Farhana Aleen	8919664	2018-03-07 17:09:18 +0000	[diff] [blame]	283	(AddrSpace >= AS.CONSTANT_BUFFER_0 &&
				284	AddrSpace <= AS.CONSTANT_BUFFER_15)))
Yaxun Liu	1a14bfa	2017-03-27 14:04:01 +0000	[diff] [blame]	285	return 128;
				286	llvm_unreachable("unhandled address space");
Matt Arsenault	0994bd5	2016-07-01 00:56:27 +0000	[diff] [blame]	287	}
				288
Matt Arsenault	f0a88db	2017-02-23 03:58:53 +0000	[diff] [blame]	289	bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
				290	unsigned Alignment,
				291	unsigned AddrSpace) const {
				292	// We allow vectorization of flat stores, even though we may need to decompose
				293	// them later if they may access private memory. We don't have enough context
				294	// here, and legalization can handle it.
Yaxun Liu	1a14bfa	2017-03-27 14:04:01 +0000	[diff] [blame]	295	if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
Matt Arsenault	f0a88db	2017-02-23 03:58:53 +0000	[diff] [blame]	296	return (Alignment >= 4 \|\| ST->hasUnalignedScratchAccess()) &&
				297	ChainSizeInBytes <= ST->getMaxPrivateElementSize();
				298	}
				299	return true;
				300	}
				301
				302	bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
				303	unsigned Alignment,
				304	unsigned AddrSpace) const {
				305	return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
				306	}
				307
				308	bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
				309	unsigned Alignment,
				310	unsigned AddrSpace) const {
				311	return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
				312	}
				313
Wei Mi	062c744	2015-05-06 17:12:25 +0000	[diff] [blame]	314	unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
Changpeng Fang	1be9b9f	2017-03-09 00:07:00 +0000	[diff] [blame]	315	// Disable unrolling if the loop is not vectorized.
Matt Arsenault	67cd347	2017-06-20 20:38:06 +0000	[diff] [blame]	316	// TODO: Enable this again.
Changpeng Fang	1be9b9f	2017-03-09 00:07:00 +0000	[diff] [blame]	317	if (VF == 1)
				318	return 1;
				319
Matt Arsenault	67cd347	2017-06-20 20:38:06 +0000	[diff] [blame]	320	return 8;
Matt Arsenault	a93441f	2014-07-19 18:15:16 +0000	[diff] [blame]	321	}
Matt Arsenault	e830f54	2015-12-01 19:08:39 +0000	[diff] [blame]	322
Matt Arsenault	3e268cc	2017-12-11 21:38:43 +0000	[diff] [blame]	323	bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
				324	MemIntrinsicInfo &Info) const {
				325	switch (Inst->getIntrinsicID()) {
				326	case Intrinsic::amdgcn_atomic_inc:
Daniil Fukalov	6e1dc68	2018-01-26 11:09:38 +0000	[diff] [blame]	327	case Intrinsic::amdgcn_atomic_dec:
				328	case Intrinsic::amdgcn_ds_fadd:
				329	case Intrinsic::amdgcn_ds_fmin:
				330	case Intrinsic::amdgcn_ds_fmax: {
Matt Arsenault	3e268cc	2017-12-11 21:38:43 +0000	[diff] [blame]	331	auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
				332	auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
				333	if (!Ordering \|\| !Volatile)
				334	return false; // Invalid.
				335
				336	unsigned OrderingVal = Ordering->getZExtValue();
				337	if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
				338	return false;
				339
				340	Info.PtrVal = Inst->getArgOperand(0);
				341	Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
				342	Info.ReadMem = true;
				343	Info.WriteMem = true;
				344	Info.IsVolatile = !Volatile->isNullValue();
				345	return true;
				346	}
				347	default:
				348	return false;
				349	}
				350	}
				351
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	352	int AMDGPUTTIImpl::getArithmeticInstrCost(
				353	unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
				354	TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
Mohammed Agabaria	2c96c43	2017-01-11 08:23:37 +0000	[diff] [blame]	355	TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	356	EVT OrigTy = TLI->getValueType(DL, Ty);
				357	if (!OrigTy.isSimple()) {
				358	return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
				359	Opd1PropInfo, Opd2PropInfo);
				360	}
				361
				362	// Legalize the type.
				363	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
				364	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				365
				366	// Because we don't have any legal vector operations, but the legal types, we
				367	// need to account for split vectors.
				368	unsigned NElts = LT.second.isVector() ?
				369	LT.second.getVectorNumElements() : 1;
				370
				371	MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
				372
				373	switch (ISD) {
Matt Arsenault	8c8fcb2	2016-03-25 01:16:40 +0000	[diff] [blame]	374	case ISD::SHL:
				375	case ISD::SRL:
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	376	case ISD::SRA:
Matt Arsenault	8c8fcb2	2016-03-25 01:16:40 +0000	[diff] [blame]	377	if (SLT == MVT::i64)
				378	return get64BitInstrCost() * LT.first * NElts;
				379
				380	// i32
				381	return getFullRateInstrCost() * LT.first * NElts;
Matt Arsenault	8c8fcb2	2016-03-25 01:16:40 +0000	[diff] [blame]	382	case ISD::ADD:
				383	case ISD::SUB:
				384	case ISD::AND:
				385	case ISD::OR:
Eugene Zelenko	d16eff8	2017-08-08 23:53:55 +0000	[diff] [blame]	386	case ISD::XOR:
Matt Arsenault	8c8fcb2	2016-03-25 01:16:40 +0000	[diff] [blame]	387	if (SLT == MVT::i64){
				388	// and, or and xor are typically split into 2 VALU instructions.
				389	return 2 * getFullRateInstrCost() * LT.first * NElts;
				390	}
				391
				392	return LT.first * NElts * getFullRateInstrCost();
Matt Arsenault	8c8fcb2	2016-03-25 01:16:40 +0000	[diff] [blame]	393	case ISD::MUL: {
				394	const int QuarterRateCost = getQuarterRateInstrCost();
				395	if (SLT == MVT::i64) {
				396	const int FullRateCost = getFullRateInstrCost();
				397	return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
				398	}
				399
				400	// i32
				401	return QuarterRateCost * NElts * LT.first;
				402	}
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	403	case ISD::FADD:
				404	case ISD::FSUB:
				405	case ISD::FMUL:
				406	if (SLT == MVT::f64)
				407	return LT.first * NElts * get64BitInstrCost();
				408
				409	if (SLT == MVT::f32 \|\| SLT == MVT::f16)
				410	return LT.first * NElts * getFullRateInstrCost();
				411	break;
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	412	case ISD::FDIV:
				413	case ISD::FREM:
				414	// FIXME: frem should be handled separately. The fdiv in it is most of it,
				415	// but the current lowering is also not entirely correct.
				416	if (SLT == MVT::f64) {
				417	int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	418	// Add cost of workaround.
				419	if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
				420	Cost += 3 * getFullRateInstrCost();
				421
				422	return LT.first * Cost * NElts;
				423	}
				424
Matt Arsenault	376f1bd	2017-08-31 05:47:00 +0000	[diff] [blame]	425	if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
				426	// TODO: This is more complicated, unsafe flags etc.
				427	if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) \|\|
				428	(SLT == MVT::f16 && ST->has16BitInsts())) {
				429	return LT.first * getQuarterRateInstrCost() * NElts;
				430	}
				431	}
				432
				433	if (SLT == MVT::f16 && ST->has16BitInsts()) {
				434	// 2 x v_cvt_f32_f16
				435	// f32 rcp
				436	// f32 fmul
				437	// v_cvt_f16_f32
				438	// f16 div_fixup
				439	int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
				440	return LT.first * Cost * NElts;
				441	}
				442
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	443	if (SLT == MVT::f32 \|\| SLT == MVT::f16) {
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	444	int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
Matt Arsenault	376f1bd	2017-08-31 05:47:00 +0000	[diff] [blame]	445
				446	if (!ST->hasFP32Denormals()) {
				447	// FP mode switches.
				448	Cost += 2 * getFullRateInstrCost();
				449	}
				450
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	451	return LT.first * NElts * Cost;
				452	}
Matt Arsenault	9651813	2016-03-25 01:00:32 +0000	[diff] [blame]	453	break;
				454	default:
				455	break;
				456	}
				457
				458	return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
				459	Opd1PropInfo, Opd2PropInfo);
				460	}
				461
Matt Arsenault	e05ff15	2015-12-16 18:37:19 +0000	[diff] [blame]	462	unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
				463	// XXX - For some reason this isn't called for switch.
				464	switch (Opcode) {
				465	case Instruction::Br:
				466	case Instruction::Ret:
				467	return 10;
				468	default:
				469	return BaseT::getCFInstrCost(Opcode);
				470	}
				471	}
				472
Farhana Aleen	e2dfe8a	2018-05-01 21:41:12 +0000	[diff] [blame]	473	int AMDGPUTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
				474	bool IsPairwise) {
				475	EVT OrigTy = TLI->getValueType(DL, Ty);
				476
				477	// Computes cost on targets that have packed math instructions(which support
				478	// 16-bit types only).
				479	if (IsPairwise \|\|
				480	!ST->hasVOP3PInsts() \|\|
				481	OrigTy.getScalarSizeInBits() != 16)
				482	return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
				483
				484	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
				485	return LT.first * getFullRateInstrCost();
				486	}
				487
Farhana Aleen	e24f3ff	2018-05-09 21:18:34 +0000	[diff] [blame]	488	int AMDGPUTTIImpl::getMinMaxReductionCost(Type Ty, Type CondTy,
				489	bool IsPairwise,
				490	bool IsUnsigned) {
				491	EVT OrigTy = TLI->getValueType(DL, Ty);
				492
				493	// Computes cost on targets that have packed math instructions(which support
				494	// 16-bit types only).
				495	if (IsPairwise \|\|
				496	!ST->hasVOP3PInsts() \|\|
				497	OrigTy.getScalarSizeInBits() != 16)
				498	return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
				499
				500	std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
				501	return LT.first * getHalfRateInstrCost();
				502	}
				503
Matt Arsenault	e830f54	2015-12-01 19:08:39 +0000	[diff] [blame]	504	int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
				505	unsigned Index) {
				506	switch (Opcode) {
				507	case Instruction::ExtractElement:
Matt Arsenault	3c5e423	2017-05-10 21:29:33 +0000	[diff] [blame]	508	case Instruction::InsertElement: {
				509	unsigned EltSize
				510	= DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
				511	if (EltSize < 32) {
				512	if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
				513	return 0;
				514	return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
				515	}
				516
Matt Arsenault	59767ce	2016-03-25 00:14:11 +0000	[diff] [blame]	517	// Extracts are just reads of a subregister, so are free. Inserts are
				518	// considered free because we don't want to have any cost for scalarizing
				519	// operations, and we don't have to copy into a different register class.
				520
Matt Arsenault	e830f54	2015-12-01 19:08:39 +0000	[diff] [blame]	521	// Dynamic indexing isn't free and is best avoided.
				522	return Index == ~0u ? 2 : 0;
Matt Arsenault	3c5e423	2017-05-10 21:29:33 +0000	[diff] [blame]	523	}
Matt Arsenault	e830f54	2015-12-01 19:08:39 +0000	[diff] [blame]	524	default:
				525	return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
				526	}
				527	}
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	528
Alexander Timofeev	2e5eece	2018-03-05 15:12:21 +0000	[diff] [blame]	529
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	530
				531	static bool isArgPassedInSGPR(const Argument *A) {
				532	const Function *F = A->getParent();
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	533
				534	// Arguments to compute shaders are never a source of divergence.
Matt Arsenault	4c1ecde	2017-04-19 17:42:34 +0000	[diff] [blame]	535	CallingConv::ID CC = F->getCallingConv();
				536	switch (CC) {
				537	case CallingConv::AMDGPU_KERNEL:
				538	case CallingConv::SPIR_KERNEL:
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	539	return true;
Matt Arsenault	4c1ecde	2017-04-19 17:42:34 +0000	[diff] [blame]	540	case CallingConv::AMDGPU_VS:
Tim Renouf	ef1ae8f	2017-09-29 09:51:22 +0000	[diff] [blame]	541	case CallingConv::AMDGPU_LS:
Marek Olsak	a302a736	2017-05-02 15:41:10 +0000	[diff] [blame]	542	case CallingConv::AMDGPU_HS:
Tim Renouf	ef1ae8f	2017-09-29 09:51:22 +0000	[diff] [blame]	543	case CallingConv::AMDGPU_ES:
Matt Arsenault	4c1ecde	2017-04-19 17:42:34 +0000	[diff] [blame]	544	case CallingConv::AMDGPU_GS:
				545	case CallingConv::AMDGPU_PS:
				546	case CallingConv::AMDGPU_CS:
				547	// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
				548	// Everything else is in VGPRs.
				549	return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) \|\|
				550	F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
				551	default:
				552	// TODO: Should calls support inreg for SGPR inputs?
				553	return false;
				554	}
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	555	}
				556
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	557	/// \returns true if the result of the value could potentially be
				558	/// different across workitems in a wavefront.
				559	bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	560	if (const Argument *A = dyn_cast<Argument>(V))
				561	return !isArgPassedInSGPR(A);
				562
				563	// Loads from the private address space are divergent, because threads
				564	// can execute the load instruction with the same inputs and get different
				565	// results.
				566	//
				567	// All other loads are not divergent, because if threads issue loads with the
				568	// same arguments, they will always get the same result.
				569	if (const LoadInst *Load = dyn_cast<LoadInst>(V))
Yaxun Liu	1a14bfa	2017-03-27 14:04:01 +0000	[diff] [blame]	570	return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	571
Nicolai Haehnle	79cad85	2016-03-17 16:21:59 +0000	[diff] [blame]	572	// Atomics are divergent because they are executed sequentially: when an
				573	// atomic operation refers to the same address in each thread, then each
				574	// thread after the first sees the value written by the previous thread as
				575	// original value.
				576	if (isa<AtomicRMWInst>(V) \|\| isa<AtomicCmpXchgInst>(V))
				577	return true;
				578
Matt Arsenault	d2c8a33	2017-02-16 02:01:13 +0000	[diff] [blame]	579	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
Alexander Timofeev	2e5eece	2018-03-05 15:12:21 +0000	[diff] [blame]	580	return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
Tom Stellard	dbe374b	2015-12-15 18:04:38 +0000	[diff] [blame]	581
				582	// Assume all function calls are a source of divergence.
				583	if (isa<CallInst>(V) \|\| isa<InvokeInst>(V))
				584	return true;
				585
				586	return false;
				587	}
Matt Arsenault	3c5e423	2017-05-10 21:29:33 +0000	[diff] [blame]	588
Alexander Timofeev	0f9c84c	2017-06-15 19:33:10 +0000	[diff] [blame]	589	bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
				590	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
				591	switch (Intrinsic->getIntrinsicID()) {
				592	default:
				593	return false;
				594	case Intrinsic::amdgcn_readfirstlane:
				595	case Intrinsic::amdgcn_readlane:
				596	return true;
				597	}
				598	}
				599	return false;
				600	}
				601
Matt Arsenault	3c5e423	2017-05-10 21:29:33 +0000	[diff] [blame]	602	unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
				603	Type *SubTp) {
				604	if (ST->hasVOP3PInsts()) {
				605	VectorType *VT = cast<VectorType>(Tp);
				606	if (VT->getNumElements() == 2 &&
				607	DL.getTypeSizeInBits(VT->getElementType()) == 16) {
				608	// With op_sel VOP3P instructions freely can access the low half or high
				609	// half of a register, so any swizzle is free.
				610
				611	switch (Kind) {
				612	case TTI::SK_Broadcast:
				613	case TTI::SK_Reverse:
				614	case TTI::SK_PermuteSingleSrc:
				615	return 0;
				616	default:
				617	break;
				618	}
				619	}
				620	}
				621
				622	return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
				623	}
Matt Arsenault	aac47c1	2017-08-07 17:08:44 +0000	[diff] [blame]	624
				625	bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller,
				626	const Function *Callee) const {
				627	const TargetMachine &TM = getTLI()->getTargetMachine();
				628	const FeatureBitset &CallerBits =
				629	TM.getSubtargetImpl(*Caller)->getFeatureBits();
				630	const FeatureBitset &CalleeBits =
				631	TM.getSubtargetImpl(*Callee)->getFeatureBits();
				632
				633	FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
				634	FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
				635	return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
				636	}