Blame - llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp - toolchain/llvm-project

blob: 3bb6dd4571c0355c8c7847de85bae1bd03bc53eb [file] [log] [blame]

Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	1	//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
				2	//
Chandler Carruth	2946cd7	2019-01-19 08:50:56 +0000	[diff] [blame]	3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	6	//
				7	//===----------------------------------------------------------------------===//
				8	//
				9	/// \file
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	10	/// Implements the AMDGPU specific subclass of TargetSubtarget.
Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#include "AMDGPUSubtarget.h"
Quentin Colombet	f3f7d4d	2017-07-05 18:40:56 +0000	[diff] [blame]	15	#include "AMDGPU.h"
				16	#include "AMDGPUTargetMachine.h"
Quentin Colombet	f3f7d4d	2017-07-05 18:40:56 +0000	[diff] [blame]	17	#include "AMDGPUCallLowering.h"
				18	#include "AMDGPUInstructionSelector.h"
				19	#include "AMDGPULegalizerInfo.h"
				20	#include "AMDGPURegisterBankInfo.h"
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	21	#include "SIMachineFunctionInfo.h"
Tom Stellard	44b30b4	2018-05-22 02:03:23 +0000	[diff] [blame]	22	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
Matt Arsenault	d9a23ab	2014-07-13 02:08:26 +0000	[diff] [blame]	23	#include "llvm/ADT/SmallString.h"
Tom Stellard	83f0bce	2015-01-29 16:55:25 +0000	[diff] [blame]	24	#include "llvm/CodeGen/MachineScheduler.h"
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	25	#include "llvm/MC/MCSubtargetInfo.h"
Stanislav Mekhanoshin	c90347d	2017-04-12 20:48:56 +0000	[diff] [blame]	26	#include "llvm/IR/MDBuilder.h"
David Blaikie	1be62f0	2017-11-03 22:32:11 +0000	[diff] [blame]	27	#include "llvm/CodeGen/TargetFrameLowering.h"
Eugene Zelenko	6a9226d	2016-12-12 22:23:53 +0000	[diff] [blame]	28	#include <algorithm>
Matt Arsenault	d9a23ab	2014-07-13 02:08:26 +0000	[diff] [blame]	29
Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	30	using namespace llvm;
				31
Chandler Carruth	e96dd89	2014-04-21 22:55:11 +0000	[diff] [blame]	32	#define DEBUG_TYPE "amdgpu-subtarget"
				33
Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	34	#define GET_SUBTARGETINFO_TARGET_DESC
				35	#define GET_SUBTARGETINFO_CTOR
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	36	#define AMDGPUSubtarget GCNSubtarget
Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	37	#include "AMDGPUGenSubtargetInfo.inc"
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	38	#define GET_SUBTARGETINFO_TARGET_DESC
				39	#define GET_SUBTARGETINFO_CTOR
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	40	#undef AMDGPUSubtarget
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	41	#include "R600GenSubtargetInfo.inc"
Tom Stellard	75aadc2	2012-12-11 21:25:42 +0000	[diff] [blame]	42
// Boolean command-line option "amdgpu-disable-power-sched"; defaults to false.
// Per its description, setting it disables the scheduling that is done to
// minimize mAI power bursts. The code that reads it is not in this part of the
// file.
static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));
				47
// The destructor is defaulted here, in the implementation file, rather than in
// the class definition.
// NOTE(review): presumably so the owned GlobalISel objects are complete types
// at the point of destruction - confirm against the header.
GCNSubtarget::~GCNSubtarget() = default;
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	49
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	50	R600Subtarget &
				51	R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
				52	StringRef GPU, StringRef FS) {
Matt Arsenault	055e4dc	2019-03-29 19:14:54 +0000	[diff] [blame]	53	SmallString<256> FullFS("+promote-alloca,");
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	54	FullFS += FS;
				55	ParseSubtargetFeatures(GPU, FullFS);
				56
				57	// FIXME: I don't think think Evergreen has any useful support for
				58	// denormals, but should be checked. Should we issue a warning somewhere
				59	// if someone tries to enable these?
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	60	if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	61	FP32Denormals = false;
				62	}
				63
				64	HasMulU24 = getGeneration() >= EVERGREEN;
				65	HasMulI24 = hasCaymanISA();
				66
				67	return *this;
				68	}
				69
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	70	GCNSubtarget &
				71	GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
Matt Arsenault	f426ddb	2019-04-03 01:58:57 +0000	[diff] [blame]	72	StringRef GPU, StringRef FS) {
Eric Christopher	ac4b69e	2014-07-25 22:22:39 +0000	[diff] [blame]	73	// Determine default and user-specified characteristics
Matt Arsenault	f171cf2	2014-07-14 23:40:49 +0000	[diff] [blame]	74	// On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
				75	// enabled, but some instructions do not respect them and they run at the
				76	// double precision rate, so don't enable by default.
				77	//
				78	// We want to be able to turn these off, but making this a subtarget feature
				79	// for SI has the unhelpful behavior that it unsets everything else if you
				80	// disable it.
David Stuttard	f77079f	2019-01-14 11:55:24 +0000	[diff] [blame]	81	//
				82	// Similarly we want enable-prt-strict-null to be on by default and not to
				83	// unset everything else if it is disabled
Matt Arsenault	d9a23ab	2014-07-13 02:08:26 +0000	[diff] [blame]	84
Matt Arsenault	f426ddb	2019-04-03 01:58:57 +0000	[diff] [blame]	85	// Assuming ECC is enabled is the conservative default.
Matt Arsenault	df24c92	2019-05-16 14:48:34 +0000	[diff] [blame]	86	SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
Jan Vesely	d1c9b61	2017-12-04 22:57:29 +0000	[diff] [blame]	87
Changpeng Fang	b41574a	2015-12-22 20:55:23 +0000	[diff] [blame]	88	if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
Matt Arsenault	e0c1f9e	2019-03-17 21:31:35 +0000	[diff] [blame]	89	FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
Matt Arsenault	a6867fd	2017-01-23 22:31:03 +0000	[diff] [blame]	90
Jan Vesely	d1c9b61	2017-12-04 22:57:29 +0000	[diff] [blame]	91	// FIXME: I don't think think Evergreen has any useful support for
				92	// denormals, but should be checked. Should we issue a warning somewhere
				93	// if someone tries to enable these?
				94	if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
				95	FullFS += "+fp64-fp16-denormals,";
				96	} else {
				97	FullFS += "-fp32-denormals,";
				98	}
				99
David Stuttard	f77079f	2019-01-14 11:55:24 +0000	[diff] [blame]	100	FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
				101
Stanislav Mekhanoshin	8bcc9bb	2019-06-13 19:18:29 +0000	[diff] [blame]	102	// Disable mutually exclusive bits.
				103	if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
				104	if (FS.find_lower("wavefrontsize16") == StringRef::npos)
				105	FullFS += "-wavefrontsize16,";
				106	if (FS.find_lower("wavefrontsize32") == StringRef::npos)
				107	FullFS += "-wavefrontsize32,";
				108	if (FS.find_lower("wavefrontsize64") == StringRef::npos)
				109	FullFS += "-wavefrontsize64,";
				110	}
				111
Matt Arsenault	d9a23ab	2014-07-13 02:08:26 +0000	[diff] [blame]	112	FullFS += FS;
				113
				114	ParseSubtargetFeatures(GPU, FullFS);
Tom Stellard	2e59a45	2014-06-13 01:32:00 +0000	[diff] [blame]	115
Jan Vesely	d1c9b61	2017-12-04 22:57:29 +0000	[diff] [blame]	116	// We don't support FP64 for EG/NI atm.
				117	assert(!hasFP64() \|\| (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
				118
Matt Arsenault	d8f7ea3	2017-01-27 17:42:26 +0000	[diff] [blame]	119	// Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
				120	// on VI and newer hardware to avoid assertion failures due to missing ADDR64
				121	// variants of MUBUF instructions.
				122	if (!hasAddr64() && !FS.contains("flat-for-global")) {
				123	FlatForGlobal = true;
				124	}
				125
Matt Arsenault	24ee078	2016-02-12 02:40:47 +0000	[diff] [blame]	126	// Set defaults if needed.
				127	if (MaxPrivateElementSize == 0)
Matt Arsenault	e8ed8e5	2016-05-11 00:28:54 +0000	[diff] [blame]	128	MaxPrivateElementSize = 4;
Matt Arsenault	24ee078	2016-02-12 02:40:47 +0000	[diff] [blame]	129
Matt Arsenault	8728c5f	2017-08-07 14:58:04 +0000	[diff] [blame]	130	if (LDSBankCount == 0)
				131	LDSBankCount = 32;
				132
				133	if (TT.getArch() == Triple::amdgcn) {
				134	if (LocalMemorySize == 0)
				135	LocalMemorySize = 32768;
				136
				137	// Do something sensible for unspecified target.
				138	if (!HasMovrel && !HasVGPRIndexMode)
				139	HasMovrel = true;
				140	}
				141
Matt Arsenault	d704727	2019-02-08 19:18:01 +0000	[diff] [blame]	142	// Don't crash on invalid devices.
				143	if (WavefrontSize == 0)
				144	WavefrontSize = 64;
				145
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	146	HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
				147
Matt Arsenault	df24c92	2019-05-16 14:48:34 +0000	[diff] [blame]	148	if (DoesNotSupportXNACK && EnableXNACK) {
				149	ToggleFeature(AMDGPU::FeatureXNACK);
				150	EnableXNACK = false;
				151	}
				152
Matt Arsenault	f426ddb	2019-04-03 01:58:57 +0000	[diff] [blame]	153	// ECC is on by default, but turn it off if the hardware doesn't support it
				154	// anyway. This matters for the gfx9 targets with d16 loads, but don't support
				155	// ECC.
				156	if (DoesNotSupportSRAMECC && EnableSRAMECC) {
				157	ToggleFeature(AMDGPU::FeatureSRAMECC);
				158	EnableSRAMECC = false;
				159	}
				160
Eric Christopher	ac4b69e	2014-07-25 22:22:39 +0000	[diff] [blame]	161	return *this;
				162	}
				163
Konstantin Zhuravlyov	71e43ee	2018-09-12 18:50:47 +0000	[diff] [blame]	164	AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	165	TargetTriple(TT),
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	166	Has16BitInsts(false),
				167	HasMadMixInsts(false),
				168	FP32Denormals(false),
				169	FPExceptions(false),
				170	HasSDWA(false),
				171	HasVOP3PInsts(false),
				172	HasMulI24(true),
				173	HasMulU24(true),
Matt Arsenault	6c7ba82	2018-08-15 21:03:55 +0000	[diff] [blame]	174	HasInv2PiInlineImm(false),
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	175	HasFminFmaxLegacy(true),
				176	EnablePromoteAlloca(false),
David Stuttard	20de3e9	2018-09-14 10:27:19 +0000	[diff] [blame]	177	HasTrigReducedRange(false),
Stanislav Mekhanoshin	2594fa8	2019-07-31 01:07:10 +0000	[diff] [blame]	178	MaxWavesPerEU(10),
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	179	LocalMemorySize(0),
				180	WavefrontSize(0)
				181	{ }
				182
/// Construct a GCN subtarget for \p GPU with feature string \p FS.
///
/// All feature members are given fixed starting values in the initializer
/// list. The InstrInfo initializer then calls
/// initializeSubtargetDependencies(), which parses \p GPU / \p FS and fills in
/// remaining defaults. The constructor body finally recomputes MaxWavesPerEU
/// and creates the call lowering, legalizer, register bank info and
/// instruction selector objects.
///
/// NOTE(review): the scheme relies on the members being initialized before
/// InstrInfo, i.e. on the declaration order in the header - confirm there.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    // Starting generation: SEA_ISLANDS for the AMDHSA OS, SOUTHERN_ISLANDS
    // otherwise.
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // 0 means "unset"; initializeSubtargetDependencies() substitutes defaults.
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    // Parses the feature string as a side effect; the result (*this) is what
    // InstrInfo is constructed from.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // Replace the value set by the AMDGPUSubtarget base constructor with the one
  // IsaInfo reports for this subtarget.
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  // The selector needs the concrete AMDGPU register bank info created above,
  // hence the downcast of the stored pointer.
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
Tom Stellard	b8fd6ef	2014-12-02 22:00:07 +0000	[diff] [blame]	290
Stanislav Mekhanoshin	f2baae0	2019-05-02 03:47:23 +0000	[diff] [blame]	291	unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
				292	if (getGeneration() < GFX10)
				293	return 1;
				294
				295	switch (Opcode) {
				296	case AMDGPU::V_LSHLREV_B64:
				297	case AMDGPU::V_LSHLREV_B64_gfx10:
				298	case AMDGPU::V_LSHL_B64:
				299	case AMDGPU::V_LSHRREV_B64:
				300	case AMDGPU::V_LSHRREV_B64_gfx10:
				301	case AMDGPU::V_LSHR_B64:
				302	case AMDGPU::V_ASHRREV_I64:
				303	case AMDGPU::V_ASHRREV_I64_gfx10:
				304	case AMDGPU::V_ASHR_I64:
				305	return 1;
				306	}
				307
				308	return 2;
				309	}
				310
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	311	unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
Stanislav Mekhanoshin	2b913b1	2017-02-01 22:59:50 +0000	[diff] [blame]	312	const Function &F) const {
				313	if (NWaves == 1)
Matt Arsenault	8a028bf	2016-05-16 21:19:59 +0000	[diff] [blame]	314	return getLocalMemorySize();
Stanislav Mekhanoshin	2b913b1	2017-02-01 22:59:50 +0000	[diff] [blame]	315	unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
				316	unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	317	if (!WorkGroupsPerCu)
				318	return 0;
Stanislav Mekhanoshin	2b913b1	2017-02-01 22:59:50 +0000	[diff] [blame]	319	unsigned MaxWaves = getMaxWavesPerEU();
				320	return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
Matt Arsenault	8a028bf	2016-05-16 21:19:59 +0000	[diff] [blame]	321	}
				322
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	323	unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
Stanislav Mekhanoshin	2b913b1	2017-02-01 22:59:50 +0000	[diff] [blame]	324	const Function &F) const {
				325	unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
				326	unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	327	if (!WorkGroupsPerCu)
				328	return 0;
Stanislav Mekhanoshin	2b913b1	2017-02-01 22:59:50 +0000	[diff] [blame]	329	unsigned MaxWaves = getMaxWavesPerEU();
				330	unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
				331	unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
				332	NumWaves = std::min(NumWaves, MaxWaves);
				333	NumWaves = std::max(NumWaves, 1u);
				334	return NumWaves;
Matt Arsenault	8a028bf	2016-05-16 21:19:59 +0000	[diff] [blame]	335	}
				336
Tom Stellard	44b30b4	2018-05-22 02:03:23 +0000	[diff] [blame]	337	unsigned
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	338	AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
Tom Stellard	44b30b4	2018-05-22 02:03:23 +0000	[diff] [blame]	339	const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
				340	return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
				341	}
				342
Matt Arsenault	b791802	2017-10-23 17:09:35 +0000	[diff] [blame]	343	std::pair<unsigned, unsigned>
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	344	AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
Matt Arsenault	b791802	2017-10-23 17:09:35 +0000	[diff] [blame]	345	switch (CC) {
				346	case CallingConv::AMDGPU_CS:
				347	case CallingConv::AMDGPU_KERNEL:
				348	case CallingConv::SPIR_KERNEL:
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	349	return std::make_pair(getWavefrontSize() * 2,
				350	std::max(getWavefrontSize() * 4, 256u));
Matt Arsenault	b791802	2017-10-23 17:09:35 +0000	[diff] [blame]	351	case CallingConv::AMDGPU_VS:
				352	case CallingConv::AMDGPU_LS:
				353	case CallingConv::AMDGPU_HS:
				354	case CallingConv::AMDGPU_ES:
				355	case CallingConv::AMDGPU_GS:
				356	case CallingConv::AMDGPU_PS:
				357	return std::make_pair(1, getWavefrontSize());
				358	default:
				359	return std::make_pair(1, 16 * getWavefrontSize());
				360	}
				361	}
				362
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	363	std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	364	const Function &F) const {
Matt Arsenault	b791802	2017-10-23 17:09:35 +0000	[diff] [blame]	365	// FIXME: 1024 if function.
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	366	// Default minimum/maximum flat work group sizes.
				367	std::pair<unsigned, unsigned> Default =
Matt Arsenault	b791802	2017-10-23 17:09:35 +0000	[diff] [blame]	368	getDefaultFlatWorkGroupSize(F.getCallingConv());
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	369
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	370	// Requested minimum/maximum flat work group sizes.
				371	std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
				372	F, "amdgpu-flat-work-group-size", Default);
				373
				374	// Make sure requested minimum is less than requested maximum.
				375	if (Requested.first > Requested.second)
				376	return Default;
				377
				378	// Make sure requested values do not violate subtarget's specifications.
				379	if (Requested.first < getMinFlatWorkGroupSize())
				380	return Default;
				381	if (Requested.second > getMaxFlatWorkGroupSize())
				382	return Default;
				383
				384	return Requested;
				385	}
				386
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	387	std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	388	const Function &F) const {
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	389	// Default minimum/maximum number of waves per execution unit.
Konstantin Zhuravlyov	fd87137	2017-02-09 21:33:23 +0000	[diff] [blame]	390	std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	391
				392	// Default/requested minimum/maximum flat work group sizes.
				393	std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
				394
				395	// If minimum/maximum flat work group sizes were explicitly requested using
				396	// "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
				397	// number of waves per execution unit to values implied by requested
				398	// minimum/maximum flat work group sizes.
				399	unsigned MinImpliedByFlatWorkGroupSize =
				400	getMaxWavesPerEU(FlatWorkGroupSizes.second);
				401	bool RequestedFlatWorkGroupSize = false;
				402
Matt Arsenault	4fb580c	2019-06-05 20:32:32 +0000	[diff] [blame]	403	if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	404	Default.first = MinImpliedByFlatWorkGroupSize;
				405	RequestedFlatWorkGroupSize = true;
				406	}
				407
				408	// Requested minimum/maximum number of waves per execution unit.
				409	std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
				410	F, "amdgpu-waves-per-eu", Default, true);
				411
				412	// Make sure requested minimum is less than requested maximum.
				413	if (Requested.second && Requested.first > Requested.second)
				414	return Default;
				415
				416	// Make sure requested values do not violate subtarget's specifications.
				417	if (Requested.first < getMinWavesPerEU() \|\|
				418	Requested.first > getMaxWavesPerEU())
				419	return Default;
				420	if (Requested.second > getMaxWavesPerEU())
				421	return Default;
				422
				423	// Make sure requested values are compatible with values implied by requested
				424	// minimum/maximum flat work group sizes.
				425	if (RequestedFlatWorkGroupSize &&
Konstantin Zhuravlyov	2ec725c	2017-07-16 19:38:47 +0000	[diff] [blame]	426	Requested.first < MinImpliedByFlatWorkGroupSize)
Konstantin Zhuravlyov	1d65026	2016-09-06 20:22:28 +0000	[diff] [blame]	427	return Default;
				428
				429	return Requested;
				430	}
				431
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	432	bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
Stanislav Mekhanoshin	c90347d	2017-04-12 20:48:56 +0000	[diff] [blame]	433	Function *Kernel = I->getParent()->getParent();
				434	unsigned MinSize = 0;
				435	unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
				436	bool IdQuery = false;
				437
				438	// If reqd_work_group_size is present it narrows value down.
				439	if (auto *CI = dyn_cast<CallInst>(I)) {
				440	const Function *F = CI->getCalledFunction();
				441	if (F) {
				442	unsigned Dim = UINT_MAX;
				443	switch (F->getIntrinsicID()) {
				444	case Intrinsic::amdgcn_workitem_id_x:
				445	case Intrinsic::r600_read_tidig_x:
				446	IdQuery = true;
Simon Pilgrim	0f5b350	2017-07-07 10:18:57 +0000	[diff] [blame]	447	LLVM_FALLTHROUGH;
Stanislav Mekhanoshin	c90347d	2017-04-12 20:48:56 +0000	[diff] [blame]	448	case Intrinsic::r600_read_local_size_x:
				449	Dim = 0;
				450	break;
				451	case Intrinsic::amdgcn_workitem_id_y:
				452	case Intrinsic::r600_read_tidig_y:
				453	IdQuery = true;
Simon Pilgrim	0f5b350	2017-07-07 10:18:57 +0000	[diff] [blame]	454	LLVM_FALLTHROUGH;
Stanislav Mekhanoshin	c90347d	2017-04-12 20:48:56 +0000	[diff] [blame]	455	case Intrinsic::r600_read_local_size_y:
				456	Dim = 1;
				457	break;
				458	case Intrinsic::amdgcn_workitem_id_z:
				459	case Intrinsic::r600_read_tidig_z:
				460	IdQuery = true;
Simon Pilgrim	0f5b350	2017-07-07 10:18:57 +0000	[diff] [blame]	461	LLVM_FALLTHROUGH;
Stanislav Mekhanoshin	c90347d	2017-04-12 20:48:56 +0000	[diff] [blame]	462	case Intrinsic::r600_read_local_size_z:
				463	Dim = 2;
				464	break;
				465	default:
				466	break;
				467	}
				468	if (Dim <= 3) {
				469	if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
				470	if (Node->getNumOperands() == 3)
				471	MinSize = MaxSize = mdconst::extract<ConstantInt>(
				472	Node->getOperand(Dim))->getZExtValue();
				473	}
				474	}
				475	}
				476
				477	if (!MaxSize)
				478	return false;
				479
				480	// Range metadata is [Lo, Hi). For ID query we need to pass max size
				481	// as Hi. For size query we need to pass Hi + 1.
				482	if (IdQuery)
				483	MinSize = 0;
				484	else
				485	++MaxSize;
				486
				487	MDBuilder MDB(I->getContext());
				488	MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
				489	APInt(32, MaxSize));
				490	I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
				491	return true;
				492	}
				493
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	494	uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
Guillaume Chatelet	b65fa48	2019-10-15 12:56:24 +0000	[diff] [blame]	495	Align &MaxAlign) const {
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	496	assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL \|\|
				497	F.getCallingConv() == CallingConv::SPIR_KERNEL);
				498
				499	const DataLayout &DL = F.getParent()->getDataLayout();
				500	uint64_t ExplicitArgBytes = 0;
Guillaume Chatelet	b65fa48	2019-10-15 12:56:24 +0000	[diff] [blame]	501	MaxAlign = Align::None();
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	502
				503	for (const Argument &Arg : F.args()) {
				504	Type *ArgTy = Arg.getType();
				505
Guillaume Chatelet	b65fa48	2019-10-15 12:56:24 +0000	[diff] [blame]	506	const Align Alignment(DL.getABITypeAlignment(ArgTy));
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	507	uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
Guillaume Chatelet	b65fa48	2019-10-15 12:56:24 +0000	[diff] [blame]	508	ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
				509	MaxAlign = std::max(MaxAlign, Alignment);
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	510	}
				511
				512	return ExplicitArgBytes;
				513	}
				514
				515	unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
Guillaume Chatelet	b65fa48	2019-10-15 12:56:24 +0000	[diff] [blame]	516	Align &MaxAlign) const {
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	517	uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
				518
				519	unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
				520
				521	uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
				522	unsigned ImplicitBytes = getImplicitArgNumBytes(F);
				523	if (ImplicitBytes != 0) {
Guillaume Chatelet	b65fa48	2019-10-15 12:56:24 +0000	[diff] [blame]	524	const Align Alignment = getAlignmentForImplicitArgPtr();
Matt Arsenault	4bec7d4	2018-07-20 09:05:08 +0000	[diff] [blame]	525	TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
				526	}
				527
				528	// Being able to dereference past the end is useful for emitting scalar loads.
				529	return alignTo(TotalSize, 4);
				530	}
				531
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	532	R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
				533	const TargetMachine &TM) :
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	534	R600GenSubtargetInfo(TT, GPU, FS),
Konstantin Zhuravlyov	71e43ee	2018-09-12 18:50:47 +0000	[diff] [blame]	535	AMDGPUSubtarget(TT),
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	536	InstrInfo(*this),
				537	FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	538	FMA(false),
				539	CaymanISA(false),
				540	CFALUBug(false),
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	541	HasVertexCache(false),
				542	R600ALUInst(false),
				543	FP64(false),
				544	TexVTXClauseSize(0),
				545	Gen(R600),
				546	TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
Matt Arsenault	0da6350	2018-08-31 05:49:54 +0000	[diff] [blame]	547	InstrItins(getInstrItineraryForCPU(GPU)) { }
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	548
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	549	void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Matt Arsenault	55dff27	2016-06-28 00:11:26 +0000	[diff] [blame]	550	unsigned NumRegionInstrs) const {
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	551	// Track register pressure so the scheduler can try to decrease
				552	// pressure once register usage is above the threshold defined by
				553	// SIRegisterInfo::getRegPressureSetLimit()
				554	Policy.ShouldTrackPressure = true;
Tom Stellard	83f0bce	2015-01-29 16:55:25 +0000	[diff] [blame]	555
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	556	// Enabling both top down and bottom up scheduling seems to give us less
				557	// register spills than just using one of these approaches on its own.
				558	Policy.OnlyTopDown = false;
				559	Policy.OnlyBottomUp = false;
Tom Stellard	83f0bce	2015-01-29 16:55:25 +0000	[diff] [blame]	560
Alexander Timofeev	9f61fea	2017-02-14 14:29:05 +0000	[diff] [blame]	561	// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
				562	if (!enableSIScheduler())
				563	Policy.ShouldTrackLaneMasks = true;
Matt Arsenault	43e92fe	2016-06-24 06:30:11 +0000	[diff] [blame]	564	}
Tom Stellard	0bc954e	2016-03-30 16:35:09 +0000	[diff] [blame]	565
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	566	bool GCNSubtarget::hasMadF16() const {
				567	return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
				568	}
				569
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	570	unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	571	if (getGeneration() >= AMDGPUSubtarget::GFX10)
Stanislav Mekhanoshin	2594fa8	2019-07-31 01:07:10 +0000	[diff] [blame]	572	return getMaxWavesPerEU();
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	573
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	574	if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
Tom Stellard	0d23ebe	2016-08-29 19:42:52 +0000	[diff] [blame]	575	if (SGPRs <= 80)
				576	return 10;
				577	if (SGPRs <= 88)
				578	return 9;
				579	if (SGPRs <= 100)
				580	return 8;
				581	return 7;
				582	}
				583	if (SGPRs <= 48)
				584	return 10;
				585	if (SGPRs <= 56)
				586	return 9;
				587	if (SGPRs <= 64)
				588	return 8;
				589	if (SGPRs <= 72)
				590	return 7;
				591	if (SGPRs <= 80)
				592	return 6;
				593	return 5;
				594	}
				595
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	596	unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
Stanislav Mekhanoshin	7b5a54e	2019-07-19 21:29:51 +0000	[diff] [blame]	597	unsigned MaxWaves = getMaxWavesPerEU();
				598	unsigned Granule = getVGPRAllocGranule();
				599	if (VGPRs < Granule)
				600	return MaxWaves;
				601	unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
Stanislav Mekhanoshin	d487d64	2019-09-19 20:09:04 +0000	[diff] [blame]	602	return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
Tom Stellard	0d23ebe	2016-08-29 19:42:52 +0000	[diff] [blame]	603	}
Matt Arsenault	4eae301	2016-10-28 20:31:47 +0000	[diff] [blame]	604
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	605	unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	606	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
Stanislav Mekhanoshin	cee607e	2019-04-24 17:03:15 +0000	[diff] [blame]	607	if (getGeneration() >= AMDGPUSubtarget::GFX10)
				608	return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
				609
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	610	if (MFI.hasFlatScratchInit()) {
				611	if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
				612	return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
				613	if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
				614	return 4; // FLAT_SCRATCH, VCC (in that order).
				615	}
				616
				617	if (isXNACKEnabled())
				618	return 4; // XNACK, VCC (in that order).
				619	return 2; // VCC.
				620	}
				621
Stanislav Mekhanoshin	2594fa8	2019-07-31 01:07:10 +0000	[diff] [blame]	622	unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
				623	unsigned LDSSize,
				624	unsigned NumSGPRs,
				625	unsigned NumVGPRs) const {
				626	unsigned Occupancy =
				627	std::min(getMaxWavesPerEU(),
				628	getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
				629	if (NumSGPRs)
				630	Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
				631	if (NumVGPRs)
				632	Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
				633	return Occupancy;
				634	}
				635
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	636	unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
Matthias Braun	f1caa28	2017-12-15 22:22:58 +0000	[diff] [blame]	637	const Function &F = MF.getFunction();
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	638	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
				639
				640	// Compute maximum number of SGPRs function can use using default/requested
				641	// minimum number of waves per execution unit.
				642	std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
				643	unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
				644	unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
				645
				646	// Check if maximum number of SGPRs was explicitly requested using
				647	// "amdgpu-num-sgpr" attribute.
				648	if (F.hasFnAttribute("amdgpu-num-sgpr")) {
				649	unsigned Requested = AMDGPU::getIntegerAttribute(
				650	F, "amdgpu-num-sgpr", MaxNumSGPRs);
				651
				652	// Make sure requested value does not violate subtarget's specifications.
				653	if (Requested && (Requested <= getReservedNumSGPRs(MF)))
				654	Requested = 0;
				655
				656	// If more SGPRs are required to support the input user/system SGPRs,
				657	// increase to accommodate them.
				658	//
				659	// FIXME: This really ends up using the requested number of SGPRs + number
				660	// of reserved special registers in total. Theoretically you could re-use
				661	// the last input registers for these special registers, but this would
				662	// require a lot of complexity to deal with the weird aliasing.
				663	unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
				664	if (Requested && Requested < InputNumSGPRs)
				665	Requested = InputNumSGPRs;
				666
				667	// Make sure requested value is compatible with values implied by
				668	// default/requested minimum/maximum number of waves per execution unit.
				669	if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
				670	Requested = 0;
				671	if (WavesPerEU.second &&
				672	Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
				673	Requested = 0;
				674
				675	if (Requested)
				676	MaxNumSGPRs = Requested;
				677	}
				678
Matt Arsenault	4eae301	2016-10-28 20:31:47 +0000	[diff] [blame]	679	if (hasSGPRInitBug())
Konstantin Zhuravlyov	9f89ede	2017-02-08 14:05:23 +0000	[diff] [blame]	680	MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
Matt Arsenault	4eae301	2016-10-28 20:31:47 +0000	[diff] [blame]	681
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	682	return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
				683	MaxAddressableNumSGPRs);
				684	}
Matt Arsenault	4eae301	2016-10-28 20:31:47 +0000	[diff] [blame]	685
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	686	unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
Matthias Braun	f1caa28	2017-12-15 22:22:58 +0000	[diff] [blame]	687	const Function &F = MF.getFunction();
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	688	const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
				689
				690	// Compute maximum number of VGPRs function can use using default/requested
				691	// minimum number of waves per execution unit.
				692	std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
				693	unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
				694
				695	// Check if maximum number of VGPRs was explicitly requested using
				696	// "amdgpu-num-vgpr" attribute.
				697	if (F.hasFnAttribute("amdgpu-num-vgpr")) {
				698	unsigned Requested = AMDGPU::getIntegerAttribute(
				699	F, "amdgpu-num-vgpr", MaxNumVGPRs);
				700
Konstantin Zhuravlyov	e03b1d7	2017-02-08 13:02:33 +0000	[diff] [blame]	701	// Make sure requested value is compatible with values implied by
				702	// default/requested minimum/maximum number of waves per execution unit.
				703	if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
				704	Requested = 0;
				705	if (WavesPerEU.second &&
				706	Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
				707	Requested = 0;
				708
				709	if (Requested)
				710	MaxNumVGPRs = Requested;
				711	}
				712
Konstantin Zhuravlyov	e004b3d	2018-06-21 20:28:19 +0000	[diff] [blame]	713	return MaxNumVGPRs;
Matt Arsenault	4eae301	2016-10-28 20:31:47 +0000	[diff] [blame]	714	}
Stanislav Mekhanoshin	d4ae470	2017-09-19 20:54:38 +0000	[diff] [blame]	715
Benjamin Kramer	f9ab3dd	2017-10-31 23:21:30 +0000	[diff] [blame]	716	namespace {
Stanislav Mekhanoshin	d4ae470	2017-09-19 20:54:38 +0000	[diff] [blame]	717	struct MemOpClusterMutation : ScheduleDAGMutation {
				718	const SIInstrInfo *TII;
				719
				720	MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
				721
Clement Courbet	b70355f	2019-03-29 08:33:05 +0000	[diff] [blame]	722	void apply(ScheduleDAGInstrs *DAG) override {
Stanislav Mekhanoshin	d4ae470	2017-09-19 20:54:38 +0000	[diff] [blame]	723	SUnit *SUa = nullptr;
				724	// Search for two consequent memory operations and link them
				725	// to prevent scheduler from moving them apart.
				726	// In DAG pre-process SUnits are in the original order of
				727	// the instructions before scheduling.
				728	for (SUnit &SU : DAG->SUnits) {
				729	MachineInstr &MI2 = *SU.getInstr();
				730	if (!MI2.mayLoad() && !MI2.mayStore()) {
				731	SUa = nullptr;
				732	continue;
				733	}
				734	if (!SUa) {
				735	SUa = &SU;
				736	continue;
				737	}
				738
				739	MachineInstr &MI1 = *SUa->getInstr();
				740	if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) \|\|
				741	(TII->isFLAT(MI1) && TII->isFLAT(MI2)) \|\|
				742	(TII->isSMRD(MI1) && TII->isSMRD(MI2)) \|\|
				743	(TII->isDS(MI1) && TII->isDS(MI2))) {
				744	SU.addPredBarrier(SUa);
				745
				746	for (const SDep &SI : SU.Preds) {
				747	if (SI.getSUnit() != SUa)
				748	SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
				749	}
				750
				751	if (&SU != &DAG->ExitSU) {
				752	for (const SDep &SI : SUa->Succs) {
				753	if (SI.getSUnit() != &SU)
				754	SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
				755	}
				756	}
				757	}
				758
				759	SUa = &SU;
				760	}
				761	}
				762	};
Stanislav Mekhanoshin	b83e283	2019-07-11 21:25:00 +0000	[diff] [blame]	763
				764	struct FillMFMAShadowMutation : ScheduleDAGMutation {
				765	const SIInstrInfo *TII;
				766
				767	ScheduleDAGMI *DAG;
				768
				769	FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
				770
				771	bool isSALU(const SUnit *SU) const {
Stanislav Mekhanoshin	fd08dcb	2019-07-15 15:34:05 +0000	[diff] [blame]	772	const MachineInstr *MI = SU->getInstr();
				773	return MI && TII->isSALU(*MI) && !MI->isTerminator();
Stanislav Mekhanoshin	b83e283	2019-07-11 21:25:00 +0000	[diff] [blame]	774	}
				775
				776	bool canAddEdge(const SUnit Succ, const SUnit Pred) const {
				777	if (Pred->NodeNum < Succ->NodeNum)
				778	return true;
				779
				780	SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
				781
				782	for (unsigned I = 0; I < Succs.size(); ++I) {
				783	for (const SDep &SI : Succs[I]->Succs) {
				784	const SUnit *SU = SI.getSUnit();
				785	if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
				786	Succs.push_back(SU);
				787	}
				788	}
				789
				790	SmallPtrSet<const SUnit*, 32> Visited;
				791	while (!Preds.empty()) {
				792	const SUnit *SU = Preds.pop_back_val();
				793	if (llvm::find(Succs, SU) != Succs.end())
				794	return false;
				795	Visited.insert(SU);
				796	for (const SDep &SI : SU->Preds)
				797	if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
				798	Preds.push_back(SI.getSUnit());
				799	}
				800
				801	return true;
				802	}
				803
				804	// Link as much SALU intructions in chain as possible. Return the size
				805	// of the chain. Links up to MaxChain instructions.
				806	unsigned linkSALUChain(SUnit From, SUnit To, unsigned MaxChain,
				807	SmallPtrSetImpl<SUnit *> &Visited) const {
				808	SmallVector<SUnit *, 8> Worklist({To});
				809	unsigned Linked = 0;
				810
				811	while (!Worklist.empty() && MaxChain-- > 0) {
				812	SUnit *SU = Worklist.pop_back_val();
				813	if (!Visited.insert(SU).second)
				814	continue;
				815
				816	LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
				817	dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
				818
				819	if (SU->addPred(SDep(From, SDep::Artificial), false))
				820	++Linked;
				821
				822	for (SDep &SI : From->Succs) {
				823	SUnit *SUv = SI.getSUnit();
				824	if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
				825	SUv->addPred(SDep(SU, SDep::Artificial), false);
				826	}
				827
				828	for (SDep &SI : SU->Succs) {
				829	SUnit *Succ = SI.getSUnit();
				830	if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
				831	Worklist.push_back(Succ);
				832	}
				833	}
				834
				835	return Linked;
				836	}
				837
				838	void apply(ScheduleDAGInstrs *DAGInstrs) override {
				839	const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
				840	if (!ST.hasMAIInsts() \|\| DisablePowerSched)
				841	return;
				842	DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
				843	const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
				844	if (!TSchedModel \|\| DAG->SUnits.empty())
				845	return;
				846
				847	// Scan for MFMA long latency instructions and try to add a dependency
				848	// of available SALU instructions to give them a chance to fill MFMA
				849	// shadow. That is desirable to fill MFMA shadow with SALU instructions
				850	// rather than VALU to prevent power consumption bursts and throttle.
				851	auto LastSALU = DAG->SUnits.begin();
				852	auto E = DAG->SUnits.end();
				853	SmallPtrSet<SUnit*, 32> Visited;
				854	for (SUnit &SU : DAG->SUnits) {
				855	MachineInstr &MAI = *SU.getInstr();
				856	if (!TII->isMAI(MAI) \|\|
				857	MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 \|\|
				858	MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
				859	continue;
				860
				861	unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
				862
				863	LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
				864	dbgs() << "Need " << Lat
				865	<< " instructions to cover latency.\n");
				866
				867	// Find up to Lat independent scalar instructions as early as
				868	// possible such that they can be scheduled after this MFMA.
				869	for ( ; Lat && LastSALU != E; ++LastSALU) {
				870	if (Visited.count(&*LastSALU))
				871	continue;
				872
				873	if (!isSALU(&LastSALU) \|\| !canAddEdge(&LastSALU, &SU))
				874	continue;
				875
				876	Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
				877	}
				878	}
				879	}
				880	};
Benjamin Kramer	f9ab3dd	2017-10-31 23:21:30 +0000	[diff] [blame]	881	} // namespace
Stanislav Mekhanoshin	d4ae470	2017-09-19 20:54:38 +0000	[diff] [blame]	882
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	883	void GCNSubtarget::getPostRAMutations(
Stanislav Mekhanoshin	d4ae470	2017-09-19 20:54:38 +0000	[diff] [blame]	884	std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Jonas Devlieghere	0eaee54	2019-08-15 15:54:37 +0000	[diff] [blame]	885	Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
				886	Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
Stanislav Mekhanoshin	d4ae470	2017-09-19 20:54:38 +0000	[diff] [blame]	887	}
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	888
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	889	const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	890	if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	891	return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	892	else
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	893	return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	894	}
				895
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	896	const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	897	if (TM.getTargetTriple().getArch() == Triple::amdgcn)
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	898	return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	899	else
Tom Stellard	5bfbae5	2018-07-11 20:59:01 +0000	[diff] [blame]	900	return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
Tom Stellard	c5a154d	2018-06-28 23:47:12 +0000	[diff] [blame]	901	}