Blame - llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp - toolchain/llvm-project

blob: 4fae0a53b325d404104819dac48822f94f34999a [file] [log] [blame]

Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	1	//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	/// \file
				10	/// This file implements a TargetTransformInfo analysis pass specific to the
				11	/// AArch64 target machine. It uses the target's detailed information to provide
				12	/// more precise answers to certain TTI queries, while letting the target
				13	/// independent and default TTI implementations handle the rest.
				14	///
				15	//===----------------------------------------------------------------------===//
				16
				17	#include "AArch64.h"
				18	#include "AArch64TargetMachine.h"
				19	#include "MCTargetDesc/AArch64AddressingModes.h"
				20	#include "llvm/Analysis/TargetTransformInfo.h"
				21	#include "llvm/Support/Debug.h"
				22	#include "llvm/Target/CostTable.h"
				23	#include "llvm/Target/TargetLowering.h"
				24	#include <algorithm>
				25	using namespace llvm;
				26
				27	#define DEBUG_TYPE "aarch64tti"
				28
				29	// Declare the pass initialization routine locally as target-specific passes
				30	// don't have a target-wide initialization entry point, and so we rely on the
				31	// pass constructor initialization.
				32	namespace llvm {
				// Invoked from the AArch64TTI(const AArch64TargetMachine *) constructor
				// with the global PassRegistry.
				// NOTE(review): presumably defined by the INITIALIZE_AG_PASS expansion
				// later in this file -- confirm against the macro definition.
				33	void initializeAArch64TTIPass(PassRegistry &);
				34	}
				35
				36	namespace {
				37
				38	class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
				39	const AArch64TargetMachine *TM;
				40	const AArch64Subtarget *ST;
				41	const AArch64TargetLowering *TLI;
				42
				43	/// Estimate the overhead of scalarizing an instruction. Insert and Extract
				44	/// are set if the result needs to be inserted and/or extracted from vectors.
				45	unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
				46
				47	public:
				48	AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
				49	llvm_unreachable("This pass cannot be directly constructed");
				50	}
				51
				52	AArch64TTI(const AArch64TargetMachine *TM)
				53	: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
				54	TLI(TM->getTargetLowering()) {
				55	initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
				56	}
				57
				58	void initializePass() override { pushTTIStack(this); }
				59
				60	void getAnalysisUsage(AnalysisUsage &AU) const override {
				61	TargetTransformInfo::getAnalysisUsage(AU);
				62	}
				63
				64	/// Pass identification.
				65	static char ID;
				66
				67	/// Provide necessary pointer adjustments for the two base classes.
				68	void getAdjustedAnalysisPointer(const void ID) override {
				69	if (ID == &TargetTransformInfo::ID)
				70	return (TargetTransformInfo *)this;
				71	return this;
				72	}
				73
				74	/// \name Scalar TTI Implementations
				75	/// @{
				76	unsigned getIntImmCost(int64_t Val) const;
				77	unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
				78	unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
				79	Type *Ty) const override;
				80	unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
				81	Type *Ty) const override;
				82	PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
				83
				84	/// @}
				85
				86	/// \name Vector TTI Implementations
				87	/// @{
				88
				89	unsigned getNumberOfRegisters(bool Vector) const override {
				90	if (Vector) {
				91	if (ST->hasNEON())
				92	return 32;
				93	return 0;
				94	}
				95	return 31;
				96	}
				97
				98	unsigned getRegisterBitWidth(bool Vector) const override {
				99	if (Vector) {
				100	if (ST->hasNEON())
				101	return 128;
				102	return 0;
				103	}
				104	return 64;
				105	}
				106
				107	unsigned getMaximumUnrollFactor() const override { return 2; }
				108
				109	unsigned getCastInstrCost(unsigned Opcode, Type Dst, Type Src) const
				110	override;
				111
				112	unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
				113	override;
				114
				115	unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				116	OperandValueKind Opd1Info = OK_AnyValue,
				117	OperandValueKind Opd2Info = OK_AnyValue) const
				118	override;
				119
				120	unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
				121
				122	unsigned getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy) const
				123	override;
				124
				125	unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
				126	unsigned AddressSpace) const override;
				127	/// @}
				128	};
				129
				130	} // end anonymous namespace
				131
				// Pass registration for AArch64TTI with the argument string "aarch64tti"
				// and the description "AArch64 Target Transform Info".
				// NOTE(review): the second argument and the three trailing booleans are
				// presumably the analysis-group interface and registration flags --
				// confirm their meaning against the INITIALIZE_AG_PASS definition.
				132	INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
				133	"AArch64 Target Transform Info", true, true, false)
				// Definition of the static pass-identification member declared in the class.
				134	char AArch64TTI::ID = 0;
				135
				136	ImmutablePass *
				137	llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
				138	return new AArch64TTI(TM);
				139	}
				140
				141	/// \brief Calculate the cost of materializing a 64-bit value. This helper
				142	/// method might only calculate a fraction of a larger immediate. Therefore it
				143	/// is valid to return a cost of ZERO.
				144	unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
				145	// Check if the immediate can be encoded within an instruction.
				146	if (Val == 0 \|\| AArch64_AM::isLogicalImmediate(Val, 64))
				147	return 0;
				148
				149	if (Val < 0)
				150	Val = ~Val;
				151
				152	// Calculate how many moves we will need to materialize this constant.
				153	unsigned LZ = countLeadingZeros((uint64_t)Val);
				154	return (64 - LZ + 15) / 16;
				155	}
				156
				157	/// \brief Calculate the cost of materializing the given constant.
				158	unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
				159	assert(Ty->isIntegerTy());
				160
				161	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				162	if (BitSize == 0)
				163	return ~0U;
				164
				165	// Sign-extend all constants to a multiple of 64-bit.
				166	APInt ImmVal = Imm;
				167	if (BitSize & 0x3f)
				168	ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
				169
				170	// Split the constant into 64-bit chunks and calculate the cost for each
				171	// chunk.
				172	unsigned Cost = 0;
				173	for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
				174	APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
				175	int64_t Val = Tmp.getSExtValue();
				176	Cost += getIntImmCost(Val);
				177	}
				178	// We need at least one instruction to materialze the constant.
				179	return std::max(1U, Cost);
				180	}
				181
				182	unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
				183	const APInt &Imm, Type *Ty) const {
				184	assert(Ty->isIntegerTy());
				185
				186	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				187	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				188	// here, so that constant hoisting will ignore this constant.
				189	if (BitSize == 0)
				190	return TCC_Free;
				191
				192	unsigned ImmIdx = ~0U;
				193	switch (Opcode) {
				194	default:
				195	return TCC_Free;
				196	case Instruction::GetElementPtr:
				197	// Always hoist the base address of a GetElementPtr.
				198	if (Idx == 0)
				199	return 2 * TCC_Basic;
				200	return TCC_Free;
				201	case Instruction::Store:
				202	ImmIdx = 0;
				203	break;
				204	case Instruction::Add:
				205	case Instruction::Sub:
				206	case Instruction::Mul:
				207	case Instruction::UDiv:
				208	case Instruction::SDiv:
				209	case Instruction::URem:
				210	case Instruction::SRem:
				211	case Instruction::And:
				212	case Instruction::Or:
				213	case Instruction::Xor:
				214	case Instruction::ICmp:
				215	ImmIdx = 1;
				216	break;
				217	// Always return TCC_Free for the shift value of a shift instruction.
				218	case Instruction::Shl:
				219	case Instruction::LShr:
				220	case Instruction::AShr:
				221	if (Idx == 1)
				222	return TCC_Free;
				223	break;
				224	case Instruction::Trunc:
				225	case Instruction::ZExt:
				226	case Instruction::SExt:
				227	case Instruction::IntToPtr:
				228	case Instruction::PtrToInt:
				229	case Instruction::BitCast:
				230	case Instruction::PHI:
				231	case Instruction::Call:
				232	case Instruction::Select:
				233	case Instruction::Ret:
				234	case Instruction::Load:
				235	break;
				236	}
				237
				238	if (Idx == ImmIdx) {
				239	unsigned NumConstants = (BitSize + 63) / 64;
				240	unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
				241	return (Cost <= NumConstants * TCC_Basic)
				242	? static_cast<unsigned>(TCC_Free) : Cost;
				243	}
				244	return AArch64TTI::getIntImmCost(Imm, Ty);
				245	}
				246
				247	unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
				248	const APInt &Imm, Type *Ty) const {
				249	assert(Ty->isIntegerTy());
				250
				251	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				252	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				253	// here, so that constant hoisting will ignore this constant.
				254	if (BitSize == 0)
				255	return TCC_Free;
				256
				257	switch (IID) {
				258	default:
				259	return TCC_Free;
				260	case Intrinsic::sadd_with_overflow:
				261	case Intrinsic::uadd_with_overflow:
				262	case Intrinsic::ssub_with_overflow:
				263	case Intrinsic::usub_with_overflow:
				264	case Intrinsic::smul_with_overflow:
				265	case Intrinsic::umul_with_overflow:
				266	if (Idx == 1) {
				267	unsigned NumConstants = (BitSize + 63) / 64;
				268	unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
				269	return (Cost <= NumConstants * TCC_Basic)
				270	? static_cast<unsigned>(TCC_Free) : Cost;
				271	}
				272	break;
				273	case Intrinsic::experimental_stackmap:
				274	if ((Idx < 2) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				275	return TCC_Free;
				276	break;
				277	case Intrinsic::experimental_patchpoint_void:
				278	case Intrinsic::experimental_patchpoint_i64:
				279	if ((Idx < 4) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				280	return TCC_Free;
				281	break;
				282	}
				283	return AArch64TTI::getIntImmCost(Imm, Ty);
				284	}
				285
				286	AArch64TTI::PopcntSupportKind
				287	AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
				288	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
				289	if (TyWidth == 32 \|\| TyWidth == 64)
				290	return PSK_FastHardware;
				291	// TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
				292	return PSK_Software;
				293	}
				294
				295	unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
				296	Type *Src) const {
				297	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				298	assert(ISD && "Invalid opcode");
				299
				300	EVT SrcTy = TLI->getValueType(Src);
				301	EVT DstTy = TLI->getValueType(Dst);
				302
				303	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
				304	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				305
				306	static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
				307	// LowerVectorINT_TO_FP:
				308	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
Tim Northover	ef0d760	2014-06-15 09:27:06 +0000	[diff] [blame^]	309	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	310	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				311	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
Tim Northover	ef0d760	2014-06-15 09:27:06 +0000	[diff] [blame^]	312	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	313	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
Tim Northover	ef0d760	2014-06-15 09:27:06 +0000	[diff] [blame^]	314
				315	// Complex: to v2f32
				316	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
				317	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
				318	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
				319	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
				320
				321	// Complex: to v4f32
				322	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
				323	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
				324	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
				325	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
				326
				327	// Complex: to v2f64
				328	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
				329	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
				330	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
				331	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
				332	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
				333	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
				334
				335
Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	336	// LowerVectorFP_TO_INT
Tim Northover	ef0d760	2014-06-15 09:27:06 +0000	[diff] [blame^]	337	{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	338	{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
				339	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
Tim Northover	ef0d760	2014-06-15 09:27:06 +0000	[diff] [blame^]	340	{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	341	{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
				342	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
Tim Northover	ef0d760	2014-06-15 09:27:06 +0000	[diff] [blame^]	343
				344	{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
				345	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
				346	{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
				347	{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
				348	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
				349	{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
Tim Northover	3b0846e	2014-05-24 12:50:23 +0000	[diff] [blame]	350	};
				351
				352	int Idx = ConvertCostTableLookup<MVT>(
				353	ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
				354	SrcTy.getSimpleVT());
				355	if (Idx != -1)
				356	return ConversionTbl[Idx].Cost;
				357
				358	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				359	}
				360
				361	unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
				362	unsigned Index) const {
				363	assert(Val->isVectorTy() && "This must be a vector type");
				364
				365	if (Index != -1U) {
				366	// Legalize the type.
				367	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
				368
				369	// This type is legalized to a scalar type.
				370	if (!LT.second.isVector())
				371	return 0;
				372
				373	// The type may be split. Normalize the index to the new type.
				374	unsigned Width = LT.second.getVectorNumElements();
				375	Index = Index % Width;
				376
				377	// The element at index zero is already inside the vector.
				378	if (Index == 0)
				379	return 0;
				380	}
				381
				382	// All other insert/extracts cost this much.
				383	return 2;
				384	}
				385
				386	unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				387	OperandValueKind Opd1Info,
				388	OperandValueKind Opd2Info) const {
				389	// Legalize the type.
				390	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
				391
				392	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				393
				394	switch (ISD) {
				395	default:
				396	return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
				397	Opd2Info);
				398	case ISD::ADD:
				399	case ISD::MUL:
				400	case ISD::XOR:
				401	case ISD::OR:
				402	case ISD::AND:
				403	// These nodes are marked as 'custom' for combining purposes only.
				404	// We know that they are legal. See LowerAdd in ISelLowering.
				405	return 1 * LT.first;
				406	}
				407	}
				408
				409	unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
				410	// Address computations in vectorized code with non-consecutive addresses will
				411	// likely result in more instructions compared to scalar code where the
				412	// computation can more often be merged into the index mode. The resulting
				413	// extra micro-ops can significantly decrease throughput.
				414	unsigned NumVectorInstToHideOverhead = 10;
				415
				416	if (Ty->isVectorTy() && IsComplex)
				417	return NumVectorInstToHideOverhead;
				418
				419	// In many cases the address computation is not merged into the instruction
				420	// addressing mode.
				421	return 1;
				422	}
				423
				424	unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				425	Type *CondTy) const {
				426
				427	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				428	// We don't lower vector selects well that are wider than the register width.
				429	if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
				430	// We would need this many instructions to hide the scalarization happening.
				431	unsigned AmortizationCost = 20;
				432	static const TypeConversionCostTblEntry<MVT::SimpleValueType>
				433	VectorSelectTbl[] = {
				434	{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
				435	{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
				436	{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
				437	{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
				438	{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
				439	{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
				440	};
				441
				442	EVT SelCondTy = TLI->getValueType(CondTy);
				443	EVT SelValTy = TLI->getValueType(ValTy);
				444	if (SelCondTy.isSimple() && SelValTy.isSimple()) {
				445	int Idx =
				446	ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
				447	SelValTy.getSimpleVT());
				448	if (Idx != -1)
				449	return VectorSelectTbl[Idx].Cost;
				450	}
				451	}
				452	return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
				453	}
				454
				455	unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
				456	unsigned Alignment,
				457	unsigned AddressSpace) const {
				458	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
				459
				460	if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
				461	Src->getVectorElementType()->isIntegerTy(64)) {
				462	// Unaligned stores are extremely inefficient. We don't split
				463	// unaligned v2i64 stores because the negative impact that has shown in
				464	// practice on inlined memcpy code.
				465	// We make v2i64 stores expensive so that we will only vectorize if there
				466	// are 6 other instructions getting vectorized.
				467	unsigned AmortizationCost = 6;
				468
				469	return LT.first * 2 * AmortizationCost;
				470	}
				471
				472	if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
				473	Src->getVectorNumElements() < 8) {
				474	// We scalarize the loads/stores because there is not v.4b register and we
				475	// have to promote the elements to v.4h.
				476	unsigned NumVecElts = Src->getVectorNumElements();
				477	unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
				478	// We generate 2 instructions per vector element.
				479	return NumVectorizableInstsToAmortize * NumVecElts * 2;
				480	}
				481
				482	return LT.first;
				483	}