Blame - llvm/lib/Target/ARM64/ARM64TargetTransformInfo.cpp - toolchain/llvm-project

blob: 372900e99feee1ab7ac984ce0c3d3a9ab676c7b1 [file] [log] [blame]

Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	1	//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	/// \file
				10	/// This file implements a TargetTransformInfo analysis pass specific to the
				11	/// ARM64 target machine. It uses the target's detailed information to provide
				12	/// more precise answers to certain TTI queries, while letting the target
				13	/// independent and default TTI implementations handle the rest.
				14	///
				15	//===----------------------------------------------------------------------===//
				16
				17	#define DEBUG_TYPE "arm64tti"
				18	#include "ARM64.h"
				19	#include "ARM64TargetMachine.h"
				20	#include "MCTargetDesc/ARM64AddressingModes.h"
				21	#include "llvm/Analysis/TargetTransformInfo.h"
				22	#include "llvm/Support/Debug.h"
				23	#include "llvm/Target/CostTable.h"
				24	#include "llvm/Target/TargetLowering.h"
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	25	#include <algorithm>
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	26	using namespace llvm;
				27
// Declare the pass initialization routine locally, as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeARM64TTIPass(PassRegistry &);
}
				34
				35	namespace {
				36
				37	class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
				38	const ARM64TargetMachine *TM;
				39	const ARM64Subtarget *ST;
				40	const ARM64TargetLowering *TLI;
				41
				42	/// Estimate the overhead of scalarizing an instruction. Insert and Extract
				43	/// are set if the result needs to be inserted and/or extracted from vectors.
				44	unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
				45
				46	public:
				47	ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
				48	llvm_unreachable("This pass cannot be directly constructed");
				49	}
				50
				51	ARM64TTI(const ARM64TargetMachine *TM)
				52	: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
				53	TLI(TM->getTargetLowering()) {
				54	initializeARM64TTIPass(*PassRegistry::getPassRegistry());
				55	}
				56
				57	void initializePass() override { pushTTIStack(this); }
				58
				59	void getAnalysisUsage(AnalysisUsage &AU) const override {
				60	TargetTransformInfo::getAnalysisUsage(AU);
				61	}
				62
				63	/// Pass identification.
				64	static char ID;
				65
				66	/// Provide necessary pointer adjustments for the two base classes.
				67	void getAdjustedAnalysisPointer(const void ID) override {
				68	if (ID == &TargetTransformInfo::ID)
				69	return (TargetTransformInfo *)this;
				70	return this;
				71	}
				72
				73	/// \name Scalar TTI Implementations
				74	/// @{
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	75	unsigned getIntImmCost(int64_t Val) const;
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	76	unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	77	unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
				78	Type *Ty) const override;
				79	unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
				80	Type *Ty) const override;
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	81	PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
				82
				83	/// @}
				84
				85	/// \name Vector TTI Implementations
				86	/// @{
				87
				88	unsigned getNumberOfRegisters(bool Vector) const override {
				89	if (Vector)
				90	return 32;
				91
				92	return 31;
				93	}
				94
				95	unsigned getRegisterBitWidth(bool Vector) const override {
				96	if (Vector)
				97	return 128;
				98
				99	return 64;
				100	}
				101
				102	unsigned getMaximumUnrollFactor() const override { return 2; }
				103
				104	unsigned getCastInstrCost(unsigned Opcode, Type Dst, Type Src) const
				105	override;
				106
				107	unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
				108	override;
				109
				110	unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				111	OperandValueKind Opd1Info = OK_AnyValue,
				112	OperandValueKind Opd2Info = OK_AnyValue) const
				113	override;
				114
				115	unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
				116
				117	unsigned getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy) const
				118	override;
				119
				120	unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
				121	unsigned AddressSpace) const override;
				122	/// @}
				123	};
				124
				125	} // end anonymous namespace
				126
// Register ARM64TTI as a member of the TargetTransformInfo analysis group
// under the command-line name "arm64tti".
INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
                   "ARM64 Target Transform Info", true, true, false)
// Storage for the pass identifier declared in the class.
char ARM64TTI::ID = 0;
				130
				131	ImmutablePass *
				132	llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
				133	return new ARM64TTI(TM);
				134	}
				135
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	136	/// \brief Calculate the cost of materializing a 64-bit value. This helper
				137	/// method might only calculate a fraction of a larger immediate. Therefore it
				138	/// is valid to return a cost of ZERO.
				139	unsigned ARM64TTI::getIntImmCost(int64_t Val) const {
				140	// Check if the immediate can be encoded within an instruction.
				141	if (Val == 0 \|\| ARM64_AM::isLogicalImmediate(Val, 64))
				142	return 0;
				143
				144	if (Val < 0)
				145	Val = ~Val;
				146
				147	// Calculate how many moves we will need to materialize this constant.
				148	unsigned LZ = countLeadingZeros((uint64_t)Val);
				149	return (64 - LZ + 15) / 16;
				150	}
				151
				152	/// \brief Calculate the cost of materializing the given constant.
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	153	unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
				154	assert(Ty->isIntegerTy());
				155
				156	unsigned BitSize = Ty->getPrimitiveSizeInBits();
Juergen Ributzka	6e17aa4	2014-04-12 02:36:28 +0000	[diff] [blame]	157	if (BitSize == 0)
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	158	return ~0U;
				159
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	160	// Sign-extend all constants to a multiple of 64-bit.
				161	APInt ImmVal = Imm;
				162	if (BitSize & 0x3f)
				163	ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	164
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	165	// Split the constant into 64-bit chunks and calculate the cost for each
				166	// chunk.
				167	unsigned Cost = 0;
				168	for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
Juergen Ributzka	48c8c07d	2014-04-10 01:36:59 +0000	[diff] [blame]	169	APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	170	int64_t Val = Tmp.getSExtValue();
				171	Cost += getIntImmCost(Val);
				172	}
				173	// We need at least one instruction to materialze the constant.
				174	return std::max(1U, Cost);
				175	}
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	176
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	177	unsigned ARM64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
				178	const APInt &Imm, Type *Ty) const {
				179	assert(Ty->isIntegerTy());
				180
				181	unsigned BitSize = Ty->getPrimitiveSizeInBits();
Juergen Ributzka	6e17aa4	2014-04-12 02:36:28 +0000	[diff] [blame]	182	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				183	// here, so that constant hoisting will ignore this constant.
				184	if (BitSize == 0)
				185	return TCC_Free;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	186
				187	unsigned ImmIdx = ~0U;
				188	switch (Opcode) {
				189	default:
				190	return TCC_Free;
				191	case Instruction::GetElementPtr:
				192	// Always hoist the base address of a GetElementPtr.
				193	if (Idx == 0)
				194	return 2 * TCC_Basic;
				195	return TCC_Free;
				196	case Instruction::Store:
				197	ImmIdx = 0;
				198	break;
				199	case Instruction::Add:
				200	case Instruction::Sub:
				201	case Instruction::Mul:
				202	case Instruction::UDiv:
				203	case Instruction::SDiv:
				204	case Instruction::URem:
				205	case Instruction::SRem:
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	206	case Instruction::And:
				207	case Instruction::Or:
				208	case Instruction::Xor:
				209	case Instruction::ICmp:
				210	ImmIdx = 1;
				211	break;
Juergen Ributzka	cf03068	2014-04-12 02:53:51 +0000	[diff] [blame]	212	// Always return TCC_Free for the shift value of a shift instruction.
				213	case Instruction::Shl:
				214	case Instruction::LShr:
				215	case Instruction::AShr:
				216	if (Idx == 1)
				217	return TCC_Free;
				218	break;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	219	case Instruction::Trunc:
				220	case Instruction::ZExt:
				221	case Instruction::SExt:
				222	case Instruction::IntToPtr:
				223	case Instruction::PtrToInt:
				224	case Instruction::BitCast:
				225	case Instruction::PHI:
				226	case Instruction::Call:
				227	case Instruction::Select:
				228	case Instruction::Ret:
				229	case Instruction::Load:
				230	break;
				231	}
				232
				233	if (Idx == ImmIdx) {
				234	unsigned NumConstants = (BitSize + 63) / 64;
				235	unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty);
Saleem Abdulrasool	c5e0099	2014-04-10 02:48:10 +0000	[diff] [blame]	236	return (Cost <= NumConstants * TCC_Basic)
				237	? static_cast<unsigned>(TCC_Free) : Cost;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	238	}
				239	return ARM64TTI::getIntImmCost(Imm, Ty);
				240	}
				241
				242	unsigned ARM64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
				243	const APInt &Imm, Type *Ty) const {
				244	assert(Ty->isIntegerTy());
				245
				246	unsigned BitSize = Ty->getPrimitiveSizeInBits();
Juergen Ributzka	6e17aa4	2014-04-12 02:36:28 +0000	[diff] [blame]	247	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				248	// here, so that constant hoisting will ignore this constant.
				249	if (BitSize == 0)
				250	return TCC_Free;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	251
				252	switch (IID) {
				253	default:
				254	return TCC_Free;
				255	case Intrinsic::sadd_with_overflow:
				256	case Intrinsic::uadd_with_overflow:
				257	case Intrinsic::ssub_with_overflow:
				258	case Intrinsic::usub_with_overflow:
				259	case Intrinsic::smul_with_overflow:
				260	case Intrinsic::umul_with_overflow:
				261	if (Idx == 1) {
				262	unsigned NumConstants = (BitSize + 63) / 64;
				263	unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty);
Saleem Abdulrasool	c5e0099	2014-04-10 02:48:10 +0000	[diff] [blame]	264	return (Cost <= NumConstants * TCC_Basic)
				265	? static_cast<unsigned>(TCC_Free) : Cost;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	266	}
				267	break;
				268	case Intrinsic::experimental_stackmap:
				269	if ((Idx < 2) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				270	return TCC_Free;
				271	break;
				272	case Intrinsic::experimental_patchpoint_void:
				273	case Intrinsic::experimental_patchpoint_i64:
				274	if ((Idx < 4) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				275	return TCC_Free;
				276	break;
				277	}
				278	return ARM64TTI::getIntImmCost(Imm, Ty);
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	279	}
				280
				281	ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
				282	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
				283	if (TyWidth == 32 \|\| TyWidth == 64)
				284	return PSK_FastHardware;
				285	// TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
				286	return PSK_Software;
				287	}
				288
				289	unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
				290	Type *Src) const {
				291	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				292	assert(ISD && "Invalid opcode");
				293
				294	EVT SrcTy = TLI->getValueType(Src);
				295	EVT DstTy = TLI->getValueType(Dst);
				296
				297	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
				298	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				299
				300	static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
				301	// LowerVectorINT_TO_FP:
				302	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
				303	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
				304	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
				305	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
				306	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				307	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
				308	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
				309	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
				310	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
				311	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				312	// LowerVectorFP_TO_INT
				313	{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
				314	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
				315	{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
				316	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
				317	{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
				318	{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
				319	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
				320	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
				321	};
				322
				323	int Idx = ConvertCostTableLookup<MVT>(
				324	ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
				325	SrcTy.getSimpleVT());
				326	if (Idx != -1)
				327	return ConversionTbl[Idx].Cost;
				328
				329	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				330	}
				331
				332	unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
				333	unsigned Index) const {
				334	assert(Val->isVectorTy() && "This must be a vector type");
				335
				336	if (Index != -1U) {
				337	// Legalize the type.
				338	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
				339
				340	// This type is legalized to a scalar type.
				341	if (!LT.second.isVector())
				342	return 0;
				343
				344	// The type may be split. Normalize the index to the new type.
				345	unsigned Width = LT.second.getVectorNumElements();
				346	Index = Index % Width;
				347
				348	// The element at index zero is already inside the vector.
				349	if (Index == 0)
				350	return 0;
				351	}
				352
				353	// All other insert/extracts cost this much.
				354	return 2;
				355	}
				356
				357	unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				358	OperandValueKind Opd1Info,
				359	OperandValueKind Opd2Info) const {
				360	// Legalize the type.
				361	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
				362
				363	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				364
				365	switch (ISD) {
				366	default:
				367	return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
				368	Opd2Info);
				369	case ISD::ADD:
				370	case ISD::MUL:
				371	case ISD::XOR:
				372	case ISD::OR:
				373	case ISD::AND:
				374	// These nodes are marked as 'custom' for combining purposes only.
				375	// We know that they are legal. See LowerAdd in ISelLowering.
				376	return 1 * LT.first;
				377	}
				378	}
				379
				380	unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
				381	// Address computations in vectorized code with non-consecutive addresses will
				382	// likely result in more instructions compared to scalar code where the
				383	// computation can more often be merged into the index mode. The resulting
				384	// extra micro-ops can significantly decrease throughput.
				385	unsigned NumVectorInstToHideOverhead = 10;
				386
				387	if (Ty->isVectorTy() && IsComplex)
				388	return NumVectorInstToHideOverhead;
				389
				390	// In many cases the address computation is not merged into the instruction
				391	// addressing mode.
				392	return 1;
				393	}
				394
				395	unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				396	Type *CondTy) const {
				397
				398	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				399	// We don't lower vector selects well that are wider than the register width.
				400	if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
				401	// We would need this many instructions to hide the scalarization happening.
				402	unsigned AmortizationCost = 20;
				403	static const TypeConversionCostTblEntry<MVT::SimpleValueType>
				404	VectorSelectTbl[] = {
				405	{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
				406	{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
				407	{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
				408	{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
				409	{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
				410	{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
				411	};
				412
				413	EVT SelCondTy = TLI->getValueType(CondTy);
				414	EVT SelValTy = TLI->getValueType(ValTy);
				415	if (SelCondTy.isSimple() && SelValTy.isSimple()) {
				416	int Idx =
				417	ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
				418	SelValTy.getSimpleVT());
				419	if (Idx != -1)
				420	return VectorSelectTbl[Idx].Cost;
				421	}
				422	}
				423	return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
				424	}
				425
				426	unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
				427	unsigned Alignment,
				428	unsigned AddressSpace) const {
				429	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
				430
				431	if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
				432	Src->getVectorElementType()->isIntegerTy(64)) {
				433	// Unaligned stores are extremely inefficient. We don't split
				434	// unaligned v2i64 stores because the negative impact that has shown in
				435	// practice on inlined memcpy code.
				436	// We make v2i64 stores expensive so that we will only vectorize if there
				437	// are 6 other instructions getting vectorized.
				438	unsigned AmortizationCost = 6;
				439
				440	return LT.first * 2 * AmortizationCost;
				441	}
				442
				443	if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
				444	Src->getVectorNumElements() < 8) {
				445	// We scalarize the loads/stores because there is not v.4b register and we
				446	// have to promote the elements to v.4h.
				447	unsigned NumVecElts = Src->getVectorNumElements();
				448	unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
				449	// We generate 2 instructions per vector element.
				450	return NumVectorizableInstsToAmortize * NumVecElts * 2;
				451	}
				452
				453	return LT.first;
				454	}