Blame - llvm/lib/Target/ARM64/ARM64TargetTransformInfo.cpp - toolchain/llvm-project

blob: 5323b8965f474dee81828a86d56b15a64f2cd306 [file] [log] [blame]

Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	1	//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	/// \file
				10	/// This file implements a TargetTransformInfo analysis pass specific to the
				11	/// ARM64 target machine. It uses the target's detailed information to provide
				12	/// more precise answers to certain TTI queries, while letting the target
				13	/// independent and default TTI implementations handle the rest.
				14	///
				15	//===----------------------------------------------------------------------===//
				16
				17	#define DEBUG_TYPE "arm64tti"
				18	#include "ARM64.h"
				19	#include "ARM64TargetMachine.h"
				20	#include "MCTargetDesc/ARM64AddressingModes.h"
				21	#include "llvm/Analysis/TargetTransformInfo.h"
				22	#include "llvm/Support/Debug.h"
				23	#include "llvm/Target/CostTable.h"
				24	#include "llvm/Target/TargetLowering.h"
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	25	#include <algorithm>
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	26	using namespace llvm;
				27
				28	// Declare the pass initialization routine locally as target-specific passes
				29	// don't have a target-wide initialization entry point, and so we rely on the
				30	// pass constructor initialization.
				31	namespace llvm {
				32	void initializeARM64TTIPass(PassRegistry &);
				33	}
				34
				35	namespace {
				36
				37	class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
				38	const ARM64TargetMachine *TM;
				39	const ARM64Subtarget *ST;
				40	const ARM64TargetLowering *TLI;
				41
				42	/// Estimate the overhead of scalarizing an instruction. Insert and Extract
				43	/// are set if the result needs to be inserted and/or extracted from vectors.
				44	unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
				45
				46	public:
				47	ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
				48	llvm_unreachable("This pass cannot be directly constructed");
				49	}
				50
				51	ARM64TTI(const ARM64TargetMachine *TM)
				52	: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
				53	TLI(TM->getTargetLowering()) {
				54	initializeARM64TTIPass(*PassRegistry::getPassRegistry());
				55	}
				56
				57	void initializePass() override { pushTTIStack(this); }
				58
				59	void getAnalysisUsage(AnalysisUsage &AU) const override {
				60	TargetTransformInfo::getAnalysisUsage(AU);
				61	}
				62
				63	/// Pass identification.
				64	static char ID;
				65
				66	/// Provide necessary pointer adjustments for the two base classes.
				67	void getAdjustedAnalysisPointer(const void ID) override {
				68	if (ID == &TargetTransformInfo::ID)
				69	return (TargetTransformInfo *)this;
				70	return this;
				71	}
				72
				73	/// \name Scalar TTI Implementations
				74	/// @{
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	75	unsigned getIntImmCost(int64_t Val) const;
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	76	unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	77	unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
				78	Type *Ty) const override;
				79	unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
				80	Type *Ty) const override;
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	81	PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
				82
				83	/// @}
				84
				85	/// \name Vector TTI Implementations
				86	/// @{
				87
				88	unsigned getNumberOfRegisters(bool Vector) const override {
				89	if (Vector)
				90	return 32;
				91
				92	return 31;
				93	}
				94
				95	unsigned getRegisterBitWidth(bool Vector) const override {
				96	if (Vector)
				97	return 128;
				98
				99	return 64;
				100	}
				101
				102	unsigned getMaximumUnrollFactor() const override { return 2; }
				103
				104	unsigned getCastInstrCost(unsigned Opcode, Type Dst, Type Src) const
				105	override;
				106
				107	unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
				108	override;
				109
				110	unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				111	OperandValueKind Opd1Info = OK_AnyValue,
				112	OperandValueKind Opd2Info = OK_AnyValue) const
				113	override;
				114
				115	unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
				116
				117	unsigned getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy) const
				118	override;
				119
				120	unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
				121	unsigned AddressSpace) const override;
				122	/// @}
				123	};
				124
				125	} // end anonymous namespace
				126
				127	INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
				128	"ARM64 Target Transform Info", true, true, false)
				129	char ARM64TTI::ID = 0;
				130
				131	ImmutablePass *
				132	llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
				133	return new ARM64TTI(TM);
				134	}
				135
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	136	/// \brief Calculate the cost of materializing a 64-bit value. This helper
				137	/// method might only calculate a fraction of a larger immediate. Therefore it
				138	/// is valid to return a cost of ZERO.
				139	unsigned ARM64TTI::getIntImmCost(int64_t Val) const {
				140	// Check if the immediate can be encoded within an instruction.
				141	if (Val == 0 \|\| ARM64_AM::isLogicalImmediate(Val, 64))
				142	return 0;
				143
				144	if (Val < 0)
				145	Val = ~Val;
				146
				147	// Calculate how many moves we will need to materialize this constant.
				148	unsigned LZ = countLeadingZeros((uint64_t)Val);
				149	return (64 - LZ + 15) / 16;
				150	}
				151
				152	/// \brief Calculate the cost of materializing the given constant.
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	153	unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
				154	assert(Ty->isIntegerTy());
				155
				156	unsigned BitSize = Ty->getPrimitiveSizeInBits();
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	157	if (BitSize == 0 \|\| BitSize > 128)
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	158	return ~0U;
				159
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	160	// Sign-extend all constants to a multiple of 64-bit.
				161	APInt ImmVal = Imm;
				162	if (BitSize & 0x3f)
				163	ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	164
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	165	// Split the constant into 64-bit chunks and calculate the cost for each
				166	// chunk.
				167	unsigned Cost = 0;
				168	for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
Juergen Ributzka	48c8c07d	2014-04-10 01:36:59 +0000	[diff] [blame^]	169	APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	170	int64_t Val = Tmp.getSExtValue();
				171	Cost += getIntImmCost(Val);
				172	}
				173	// We need at least one instruction to materialze the constant.
				174	return std::max(1U, Cost);
				175	}
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	176
Juergen Ributzka	c11e8b6	2014-04-08 20:39:59 +0000	[diff] [blame]	177	unsigned ARM64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
				178	const APInt &Imm, Type *Ty) const {
				179	assert(Ty->isIntegerTy());
				180
				181	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				182	if (BitSize == 0 \|\| BitSize > 128)
				183	return ~0U;
				184
				185	unsigned ImmIdx = ~0U;
				186	switch (Opcode) {
				187	default:
				188	return TCC_Free;
				189	case Instruction::GetElementPtr:
				190	// Always hoist the base address of a GetElementPtr.
				191	if (Idx == 0)
				192	return 2 * TCC_Basic;
				193	return TCC_Free;
				194	case Instruction::Store:
				195	ImmIdx = 0;
				196	break;
				197	case Instruction::Add:
				198	case Instruction::Sub:
				199	case Instruction::Mul:
				200	case Instruction::UDiv:
				201	case Instruction::SDiv:
				202	case Instruction::URem:
				203	case Instruction::SRem:
				204	case Instruction::Shl:
				205	case Instruction::LShr:
				206	case Instruction::AShr:
				207	case Instruction::And:
				208	case Instruction::Or:
				209	case Instruction::Xor:
				210	case Instruction::ICmp:
				211	ImmIdx = 1;
				212	break;
				213	case Instruction::Trunc:
				214	case Instruction::ZExt:
				215	case Instruction::SExt:
				216	case Instruction::IntToPtr:
				217	case Instruction::PtrToInt:
				218	case Instruction::BitCast:
				219	case Instruction::PHI:
				220	case Instruction::Call:
				221	case Instruction::Select:
				222	case Instruction::Ret:
				223	case Instruction::Load:
				224	break;
				225	}
				226
				227	if (Idx == ImmIdx) {
				228	unsigned NumConstants = (BitSize + 63) / 64;
				229	unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty);
				230	return (Cost <= NumConstants * TCC_Basic) ? TCC_Free : Cost;
				231	}
				232	return ARM64TTI::getIntImmCost(Imm, Ty);
				233	}
				234
				235	unsigned ARM64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
				236	const APInt &Imm, Type *Ty) const {
				237	assert(Ty->isIntegerTy());
				238
				239	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				240	if (BitSize == 0 \|\| BitSize > 128)
				241	return ~0U;
				242
				243	switch (IID) {
				244	default:
				245	return TCC_Free;
				246	case Intrinsic::sadd_with_overflow:
				247	case Intrinsic::uadd_with_overflow:
				248	case Intrinsic::ssub_with_overflow:
				249	case Intrinsic::usub_with_overflow:
				250	case Intrinsic::smul_with_overflow:
				251	case Intrinsic::umul_with_overflow:
				252	if (Idx == 1) {
				253	unsigned NumConstants = (BitSize + 63) / 64;
				254	unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty);
				255	return (Cost <= NumConstants * TCC_Basic) ? TCC_Free : Cost;
				256	}
				257	break;
				258	case Intrinsic::experimental_stackmap:
				259	if ((Idx < 2) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				260	return TCC_Free;
				261	break;
				262	case Intrinsic::experimental_patchpoint_void:
				263	case Intrinsic::experimental_patchpoint_i64:
				264	if ((Idx < 4) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				265	return TCC_Free;
				266	break;
				267	}
				268	return ARM64TTI::getIntImmCost(Imm, Ty);
Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	269	}
				270
				271	ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
				272	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
				273	if (TyWidth == 32 \|\| TyWidth == 64)
				274	return PSK_FastHardware;
				275	// TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
				276	return PSK_Software;
				277	}
				278
				279	unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
				280	Type *Src) const {
				281	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				282	assert(ISD && "Invalid opcode");
				283
				284	EVT SrcTy = TLI->getValueType(Src);
				285	EVT DstTy = TLI->getValueType(Dst);
				286
				287	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
				288	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				289
				290	static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
				291	// LowerVectorINT_TO_FP:
				292	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
				293	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
				294	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
				295	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
				296	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				297	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
				298	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
				299	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
				300	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
				301	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				302	// LowerVectorFP_TO_INT
				303	{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
				304	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
				305	{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
				306	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
				307	{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
				308	{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
				309	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
				310	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
				311	};
				312
				313	int Idx = ConvertCostTableLookup<MVT>(
				314	ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
				315	SrcTy.getSimpleVT());
				316	if (Idx != -1)
				317	return ConversionTbl[Idx].Cost;
				318
				319	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				320	}
				321
				322	unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
				323	unsigned Index) const {
				324	assert(Val->isVectorTy() && "This must be a vector type");
				325
				326	if (Index != -1U) {
				327	// Legalize the type.
				328	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
				329
				330	// This type is legalized to a scalar type.
				331	if (!LT.second.isVector())
				332	return 0;
				333
				334	// The type may be split. Normalize the index to the new type.
				335	unsigned Width = LT.second.getVectorNumElements();
				336	Index = Index % Width;
				337
				338	// The element at index zero is already inside the vector.
				339	if (Index == 0)
				340	return 0;
				341	}
				342
				343	// All other insert/extracts cost this much.
				344	return 2;
				345	}
				346
				347	unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				348	OperandValueKind Opd1Info,
				349	OperandValueKind Opd2Info) const {
				350	// Legalize the type.
				351	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
				352
				353	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				354
				355	switch (ISD) {
				356	default:
				357	return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
				358	Opd2Info);
				359	case ISD::ADD:
				360	case ISD::MUL:
				361	case ISD::XOR:
				362	case ISD::OR:
				363	case ISD::AND:
				364	// These nodes are marked as 'custom' for combining purposes only.
				365	// We know that they are legal. See LowerAdd in ISelLowering.
				366	return 1 * LT.first;
				367	}
				368	}
				369
				370	unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
				371	// Address computations in vectorized code with non-consecutive addresses will
				372	// likely result in more instructions compared to scalar code where the
				373	// computation can more often be merged into the index mode. The resulting
				374	// extra micro-ops can significantly decrease throughput.
				375	unsigned NumVectorInstToHideOverhead = 10;
				376
				377	if (Ty->isVectorTy() && IsComplex)
				378	return NumVectorInstToHideOverhead;
				379
				380	// In many cases the address computation is not merged into the instruction
				381	// addressing mode.
				382	return 1;
				383	}
				384
				385	unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				386	Type *CondTy) const {
				387
				388	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				389	// We don't lower vector selects well that are wider than the register width.
				390	if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
				391	// We would need this many instructions to hide the scalarization happening.
				392	unsigned AmortizationCost = 20;
				393	static const TypeConversionCostTblEntry<MVT::SimpleValueType>
				394	VectorSelectTbl[] = {
				395	{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
				396	{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
				397	{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
				398	{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
				399	{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
				400	{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
				401	};
				402
				403	EVT SelCondTy = TLI->getValueType(CondTy);
				404	EVT SelValTy = TLI->getValueType(ValTy);
				405	if (SelCondTy.isSimple() && SelValTy.isSimple()) {
				406	int Idx =
				407	ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
				408	SelValTy.getSimpleVT());
				409	if (Idx != -1)
				410	return VectorSelectTbl[Idx].Cost;
				411	}
				412	}
				413	return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
				414	}
				415
				416	unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
				417	unsigned Alignment,
				418	unsigned AddressSpace) const {
				419	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
				420
				421	if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
				422	Src->getVectorElementType()->isIntegerTy(64)) {
				423	// Unaligned stores are extremely inefficient. We don't split
				424	// unaligned v2i64 stores because the negative impact that has shown in
				425	// practice on inlined memcpy code.
				426	// We make v2i64 stores expensive so that we will only vectorize if there
				427	// are 6 other instructions getting vectorized.
				428	unsigned AmortizationCost = 6;
				429
				430	return LT.first * 2 * AmortizationCost;
				431	}
				432
				433	if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
				434	Src->getVectorNumElements() < 8) {
				435	// We scalarize the loads/stores because there is not v.4b register and we
				436	// have to promote the elements to v.4h.
				437	unsigned NumVecElts = Src->getVectorNumElements();
				438	unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
				439	// We generate 2 instructions per vector element.
				440	return NumVectorizableInstsToAmortize * NumVecElts * 2;
				441	}
				442
				443	return LT.first;
				444	}