//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

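// Command-line knobs for the unrolling heuristics below. Since these are
// cl::opt flags, they can be tuned without rebuilding, e.g. directly on the
// llc/opt command line or (assuming the usual option plumbing) via clang's
// -mllvm, as in -mllvm -amdgpu-unroll-threshold-private=4000.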
static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

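// Returns true if the branch condition \p Cond (followed transitively through
// its instruction operands, up to a small depth) depends on a PHI node that is
// defined in loop \p L but not in any of its sub-loops.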
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
               return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

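// AMDGPU-specific loop unrolling heuristics: on top of the generic defaults,
// boost the unroll threshold for loops whose GEPs into private (scratch) or
// local (LDS) memory depend on loop-defined values, and for conditional
// branches whose condition depends on a loop-local PHI.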
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  AMDGPUAS ASST = ST->getAMDGPUAS();
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
             return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          if (L->isLoopExiting(Br->getSuccessor(0)) ||
              L->isLoopExiting(Br->getSuccessor(1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == ASST.PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == ASST.LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == ASST.PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == ASST.LOCAL_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not based on a variable; most likely we will be unable to combine
        // it. Also do not unroll too-deep inner loops for local memory, to
        // give an outer loop a chance to be unrolled for a more important
        // reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
                 return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator.  allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs.  If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

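// The vectorizer's requested load/store width is clamped so that a single
// access does not exceed 128 bits; for loads the clamp is only applied to
// sub-32-bit element types (see the TODO below).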
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
    return 512;
  }

  if (AddrSpace == AS.FLAT_ADDRESS ||
      AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 128;

  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

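// Describe the target-specific atomic intrinsics handled below to the
// optimizer as memory operations, extracting the pointer, ordering and
// volatility from the intrinsic's operands.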
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

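// Arithmetic cost model: legalize the type, then scale a per-opcode rate
// (full, quarter or 64-bit rate) by the number of legalized parts and vector
// elements. Opcodes not handled here fall back to the base implementation.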
int GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but do have legal
  // vector types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

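// Returns true if argument \p A is known to be passed in an SGPR, i.e. its
// value is uniform across the wavefront.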
static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

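// readfirstlane and readlane produce a value that is uniform across the
// wavefront by construction, so report them as always uniform.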
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
  return false;
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

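// Inlining is treated as compatible when the callee's required subtarget
// features (ignoring the bits in InlineFeatureIgnoreList) are a subset of the
// caller's features.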
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

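// R600 implementations of the TTI hooks. These largely mirror the GCN
// versions above, with limits appropriate to the pre-GCN hardware.
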
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  AMDGPUAS AS = ST->getAMDGPUAS();
  if (AddrSpace == AS.GLOBAL_ADDRESS ||
      AddrSpace == AS.CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AS.LOCAL_ADDRESS ||
      AddrSpace == AS.REGION_ADDRESS)
    return 64;
  if (AddrSpace == AS.PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AS.PARAM_D_ADDRESS ||
       AddrSpace == AS.PARAM_I_ADDRESS ||
       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
        AddrSpace <= AS.CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to
  // decompose them later if they may access private memory. We don't have
  // enough context here, and legalization can handle it.
  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS)
    return false;
  return true;
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}