//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(150), cl::Hidden);

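// Return true if \p Cond depends, through up to 10 levels of operands, on a
// PHI node that does not belong to any sub-loop of \p L, i.e. a PHI of the
// loop currently being considered itself.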
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit in registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate the
      // if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          if (L->isLoopExiting(Br->getSuccessor(0)) ||
              L->isLoopExiting(Br->getSuccessor(1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not to a variable; most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory, to give an
        // outer loop a chance to be unrolled for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
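  // A wavefront can address 256 32-bit VGPRs, hence the value reported here.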
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

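// For loads of sub-dword element types, the vectorization factor is capped so
// that the resulting vector is no wider than 128 bits.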
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 128;

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
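    // For these intrinsics the memory ordering is encoded as constant operand
    // 2 and the volatile flag as constant operand 4; anything else is treated
    // as an invalid call below.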
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}

int GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but we do have the
  // legal types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
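      // A 64-bit multiply is expanded into roughly four 32-bit multiply parts
      // plus the full-rate adds needed to combine the partial products, which
      // is what the formula below models.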
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
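      // The general fdiv expansion is roughly a reciprocal plus a short
      // scale/fma/fixup refinement sequence, modeled here as seven full-rate
      // instructions and one quarter-rate reciprocal.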
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
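      // With 16-bit instructions, the low half of a 32-bit register can be
      // accessed directly, so inserting or extracting element 0 of a 16-bit
      // type costs nothing.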
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
    // Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
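      // readfirstlane/readlane broadcast a single lane's value, and the
      // wave-wide icmp/fcmp intrinsics return one mask for the whole
      // wavefront, so their results are uniform by construction.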
      return true;
    }
  }
  return false;
}

unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or high
      // half of a register, so any swizzle is free.

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
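  // Inlining is considered compatible when every feature the callee requires
  // (after dropping features on the ignore list) is also enabled in the
  // caller, i.e. the callee's feature set is a subset of the caller's.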
  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
      AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
      (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
      AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}