Blame - llvm/lib/Target/ARM64/ARM64TargetTransformInfo.cpp - toolchain/llvm-project

blob: 9b598d76561d2785f1a8a7e5957a0d931e839db2 [file] [log] [blame]

Tim Northover	00ed996	2014-03-29 10:18:08 +0000	[diff] [blame]	1	//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	/// \file
				10	/// This file implements a TargetTransformInfo analysis pass specific to the
				11	/// ARM64 target machine. It uses the target's detailed information to provide
				12	/// more precise answers to certain TTI queries, while letting the target
				13	/// independent and default TTI implementations handle the rest.
				14	///
				15	//===----------------------------------------------------------------------===//
				16
				17	#define DEBUG_TYPE "arm64tti"
				18	#include "ARM64.h"
				19	#include "ARM64TargetMachine.h"
				20	#include "MCTargetDesc/ARM64AddressingModes.h"
				21	#include "llvm/Analysis/TargetTransformInfo.h"
				22	#include "llvm/Support/Debug.h"
				23	#include "llvm/Target/CostTable.h"
				24	#include "llvm/Target/TargetLowering.h"
				25	using namespace llvm;
				26
				27	// Declare the pass initialization routine locally as target-specific passes
				28	// don't have a target-wide initialization entry point, and so we rely on the
				29	// pass constructor initialization.
				30	namespace llvm {
				31	void initializeARM64TTIPass(PassRegistry &);
				32	}
				33
				34	namespace {
				35
				36	class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
				37	const ARM64TargetMachine *TM;
				38	const ARM64Subtarget *ST;
				39	const ARM64TargetLowering *TLI;
				40
				41	/// Estimate the overhead of scalarizing an instruction. Insert and Extract
				42	/// are set if the result needs to be inserted and/or extracted from vectors.
				43	unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
				44
				45	public:
				46	ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
				47	llvm_unreachable("This pass cannot be directly constructed");
				48	}
				49
				50	ARM64TTI(const ARM64TargetMachine *TM)
				51	: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
				52	TLI(TM->getTargetLowering()) {
				53	initializeARM64TTIPass(*PassRegistry::getPassRegistry());
				54	}
				55
				56	void initializePass() override { pushTTIStack(this); }
				57
				58	void getAnalysisUsage(AnalysisUsage &AU) const override {
				59	TargetTransformInfo::getAnalysisUsage(AU);
				60	}
				61
				62	/// Pass identification.
				63	static char ID;
				64
				65	/// Provide necessary pointer adjustments for the two base classes.
				66	void getAdjustedAnalysisPointer(const void ID) override {
				67	if (ID == &TargetTransformInfo::ID)
				68	return (TargetTransformInfo *)this;
				69	return this;
				70	}
				71
				72	/// \name Scalar TTI Implementations
				73	/// @{
				74
				75	unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
				76	PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
				77
				78	/// @}
				79
				80	/// \name Vector TTI Implementations
				81	/// @{
				82
				83	unsigned getNumberOfRegisters(bool Vector) const override {
				84	if (Vector)
				85	return 32;
				86
				87	return 31;
				88	}
				89
				90	unsigned getRegisterBitWidth(bool Vector) const override {
				91	if (Vector)
				92	return 128;
				93
				94	return 64;
				95	}
				96
				97	unsigned getMaximumUnrollFactor() const override { return 2; }
				98
				99	unsigned getCastInstrCost(unsigned Opcode, Type Dst, Type Src) const
				100	override;
				101
				102	unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
				103	override;
				104
				105	unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				106	OperandValueKind Opd1Info = OK_AnyValue,
				107	OperandValueKind Opd2Info = OK_AnyValue) const
				108	override;
				109
				110	unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
				111
				112	unsigned getCmpSelInstrCost(unsigned Opcode, Type ValTy, Type CondTy) const
				113	override;
				114
				115	unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
				116	unsigned AddressSpace) const override;
				117	/// @}
				118	};
				119
				120	} // end anonymous namespace
				121
				122	INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
				123	"ARM64 Target Transform Info", true, true, false)
				124	char ARM64TTI::ID = 0;
				125
				126	ImmutablePass *
				127	llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
				128	return new ARM64TTI(TM);
				129	}
				130
				131	unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
				132	assert(Ty->isIntegerTy());
				133
				134	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				135	if (BitSize == 0)
				136	return ~0U;
				137
				138	int64_t Val = Imm.getSExtValue();
				139	if (Val == 0 \|\| ARM64_AM::isLogicalImmediate(Val, BitSize))
				140	return 1;
				141
				142	if ((int64_t)Val < 0)
				143	Val = ~Val;
				144	if (BitSize == 32)
				145	Val &= (1LL << 32) - 1;
				146
				147	unsigned LZ = countLeadingZeros((uint64_t)Val);
				148	unsigned Shift = (63 - LZ) / 16;
				149	// MOVZ is free so return true for one or fewer MOVK.
				150	return (Shift == 0) ? 1 : Shift;
				151	}
				152
				153	ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
				154	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
				155	if (TyWidth == 32 \|\| TyWidth == 64)
				156	return PSK_FastHardware;
				157	// TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
				158	return PSK_Software;
				159	}
				160
				161	unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
				162	Type *Src) const {
				163	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				164	assert(ISD && "Invalid opcode");
				165
				166	EVT SrcTy = TLI->getValueType(Src);
				167	EVT DstTy = TLI->getValueType(Dst);
				168
				169	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
				170	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				171
				172	static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
				173	// LowerVectorINT_TO_FP:
				174	{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
				175	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
				176	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
				177	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
				178	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				179	{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
				180	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
				181	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
				182	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
				183	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
				184	// LowerVectorFP_TO_INT
				185	{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
				186	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
				187	{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
				188	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
				189	{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
				190	{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
				191	{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
				192	{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
				193	};
				194
				195	int Idx = ConvertCostTableLookup<MVT>(
				196	ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
				197	SrcTy.getSimpleVT());
				198	if (Idx != -1)
				199	return ConversionTbl[Idx].Cost;
				200
				201	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				202	}
				203
				204	unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
				205	unsigned Index) const {
				206	assert(Val->isVectorTy() && "This must be a vector type");
				207
				208	if (Index != -1U) {
				209	// Legalize the type.
				210	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
				211
				212	// This type is legalized to a scalar type.
				213	if (!LT.second.isVector())
				214	return 0;
				215
				216	// The type may be split. Normalize the index to the new type.
				217	unsigned Width = LT.second.getVectorNumElements();
				218	Index = Index % Width;
				219
				220	// The element at index zero is already inside the vector.
				221	if (Index == 0)
				222	return 0;
				223	}
				224
				225	// All other insert/extracts cost this much.
				226	return 2;
				227	}
				228
				229	unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				230	OperandValueKind Opd1Info,
				231	OperandValueKind Opd2Info) const {
				232	// Legalize the type.
				233	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
				234
				235	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				236
				237	switch (ISD) {
				238	default:
				239	return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
				240	Opd2Info);
				241	case ISD::ADD:
				242	case ISD::MUL:
				243	case ISD::XOR:
				244	case ISD::OR:
				245	case ISD::AND:
				246	// These nodes are marked as 'custom' for combining purposes only.
				247	// We know that they are legal. See LowerAdd in ISelLowering.
				248	return 1 * LT.first;
				249	}
				250	}
				251
				252	unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
				253	// Address computations in vectorized code with non-consecutive addresses will
				254	// likely result in more instructions compared to scalar code where the
				255	// computation can more often be merged into the index mode. The resulting
				256	// extra micro-ops can significantly decrease throughput.
				257	unsigned NumVectorInstToHideOverhead = 10;
				258
				259	if (Ty->isVectorTy() && IsComplex)
				260	return NumVectorInstToHideOverhead;
				261
				262	// In many cases the address computation is not merged into the instruction
				263	// addressing mode.
				264	return 1;
				265	}
				266
				267	unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				268	Type *CondTy) const {
				269
				270	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				271	// We don't lower vector selects well that are wider than the register width.
				272	if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
				273	// We would need this many instructions to hide the scalarization happening.
				274	unsigned AmortizationCost = 20;
				275	static const TypeConversionCostTblEntry<MVT::SimpleValueType>
				276	VectorSelectTbl[] = {
				277	{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
				278	{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
				279	{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
				280	{ ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
				281	{ ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
				282	{ ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
				283	};
				284
				285	EVT SelCondTy = TLI->getValueType(CondTy);
				286	EVT SelValTy = TLI->getValueType(ValTy);
				287	if (SelCondTy.isSimple() && SelValTy.isSimple()) {
				288	int Idx =
				289	ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
				290	SelValTy.getSimpleVT());
				291	if (Idx != -1)
				292	return VectorSelectTbl[Idx].Cost;
				293	}
				294	}
				295	return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
				296	}
				297
				298	unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
				299	unsigned Alignment,
				300	unsigned AddressSpace) const {
				301	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
				302
				303	if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
				304	Src->getVectorElementType()->isIntegerTy(64)) {
				305	// Unaligned stores are extremely inefficient. We don't split
				306	// unaligned v2i64 stores because the negative impact that has shown in
				307	// practice on inlined memcpy code.
				308	// We make v2i64 stores expensive so that we will only vectorize if there
				309	// are 6 other instructions getting vectorized.
				310	unsigned AmortizationCost = 6;
				311
				312	return LT.first * 2 * AmortizationCost;
				313	}
				314
				315	if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
				316	Src->getVectorNumElements() < 8) {
				317	// We scalarize the loads/stores because there is not v.4b register and we
				318	// have to promote the elements to v.4h.
				319	unsigned NumVecElts = Src->getVectorNumElements();
				320	unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
				321	// We generate 2 instructions per vector element.
				322	return NumVectorizableInstsToAmortize * NumVecElts * 2;
				323	}
				324
				325	return LT.first;
				326	}