Blame - llvm/lib/Target/X86/X86TargetTransformInfo.cpp - toolchain/llvm-project

blob: ac63db50dc13a5baa7a47f572401ec4cce605df0 [file] [log] [blame]

Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	1	//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	/// \file
				10	/// This file implements a TargetTransformInfo analysis pass specific to the
				11	/// X86 target machine. It uses the target's detailed information to provide
				12	/// more precise answers to certain TTI queries, while letting the target
				13	/// independent and default TTI implementations handle the rest.
				14	///
				15	//===----------------------------------------------------------------------===//
				16
				17	#define DEBUG_TYPE "x86tti"
				18	#include "X86.h"
				19	#include "X86TargetMachine.h"
Chandler Carruth	d3e7355	2013-01-07 03:08:10 +0000	[diff] [blame]	20	#include "llvm/Analysis/TargetTransformInfo.h"
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	21	#include "llvm/Support/Debug.h"
				22	#include "llvm/Target/TargetLowering.h"
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	23	#include "llvm/Target/CostTable.h"
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	24	using namespace llvm;
				25
				26	// Declare the pass initialization routine locally as target-specific passes
				27	// don't have a target-wide initialization entry point, and so we rely on the
				28	// pass constructor initialization.
				29	namespace llvm {
				30	void initializeX86TTIPass(PassRegistry &);
				31	}
				32
				33	namespace {
				34
				35	class X86TTI : public ImmutablePass, public TargetTransformInfo {
				36	const X86TargetMachine *TM;
				37	const X86Subtarget *ST;
				38	const X86TargetLowering *TLI;
				39
				40	/// Estimate the overhead of scalarizing an instruction. Insert and Extract
				41	/// are set if the result needs to be inserted and/or extracted from vectors.
				42	unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
				43
				44	public:
				45	X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
				46	llvm_unreachable("This pass cannot be directly constructed");
				47	}
				48
				49	X86TTI(const X86TargetMachine *TM)
				50	: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
				51	TLI(TM->getTargetLowering()) {
				52	initializeX86TTIPass(*PassRegistry::getPassRegistry());
				53	}
				54
				55	virtual void initializePass() {
				56	pushTTIStack(this);
				57	}
				58
				59	virtual void finalizePass() {
				60	popTTIStack();
				61	}
				62
				63	virtual void getAnalysisUsage(AnalysisUsage &AU) const {
				64	TargetTransformInfo::getAnalysisUsage(AU);
				65	}
				66
				67	/// Pass identification.
				68	static char ID;
				69
				70	/// Provide necessary pointer adjustments for the two base classes.
				71	virtual void getAdjustedAnalysisPointer(const void ID) {
				72	if (ID == &TargetTransformInfo::ID)
				73	return (TargetTransformInfo*)this;
				74	return this;
				75	}
				76
				77	/// \name Scalar TTI Implementations
				78	/// @{
Chandler Carruth	50a36cd	2013-01-07 03:16:03 +0000	[diff] [blame]	79	virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	80
				81	/// @}
				82
				83	/// \name Vector TTI Implementations
				84	/// @{
				85
				86	virtual unsigned getNumberOfRegisters(bool Vector) const;
Nadav Rotem	b1791a7	2013-01-09 22:29:00 +0000	[diff] [blame]	87	virtual unsigned getRegisterBitWidth(bool Vector) const;
Nadav Rotem	b696c36	2013-01-09 01:15:42 +0000	[diff] [blame]	88	virtual unsigned getMaximumUnrollFactor() const;
Arnold Schwaighofer	b977387	2013-04-04 23:26:21 +0000	[diff] [blame]	89	virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				90	OperandValueKind,
				91	OperandValueKind) const;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	92	virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
				93	int Index, Type *SubTp) const;
				94	virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
				95	Type *Src) const;
				96	virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				97	Type *CondTy) const;
				98	virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
				99	unsigned Index) const;
				100	virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
				101	unsigned Alignment,
				102	unsigned AddressSpace) const;
				103
				104	/// @}
				105	};
				106
				107	} // end anonymous namespace
				108
				109	INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
				110	"X86 Target Transform Info", true, true, false)
				// Out-of-line definition of the static pass identifier declared in X86TTI.
				111	char X86TTI::ID = 0;
				112
				113	ImmutablePass *
				114	llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
				115	return new X86TTI(TM);
				116	}
				117
				118
				119	//===----------------------------------------------------------------------===//
				120	//
				121	// X86 cost model.
				122	//
				123	//===----------------------------------------------------------------------===//
				124
Chandler Carruth	50a36cd	2013-01-07 03:16:03 +0000	[diff] [blame]	125	X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	126	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
				127	// TODO: Currently the __builtin_popcount() implementation using SSE3
				128	// instructions is inefficient. Once the problem is fixed, we should
				129	// call ST->hasSSE3() instead of ST->hasSSE4().
Chandler Carruth	50a36cd	2013-01-07 03:16:03 +0000	[diff] [blame]	130	return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	131	}
				132
				133	unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
Nadav Rotem	b1791a7	2013-01-09 22:29:00 +0000	[diff] [blame]	134	if (Vector && !ST->hasSSE1())
				135	return 0;
				136
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	137	if (ST->is64Bit())
				138	return 16;
				139	return 8;
				140	}
				141
Nadav Rotem	b1791a7	2013-01-09 22:29:00 +0000	[diff] [blame]	142	unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
				143	if (Vector) {
				144	if (ST->hasAVX()) return 256;
				145	if (ST->hasSSE1()) return 128;
				146	return 0;
				147	}
				148
				149	if (ST->is64Bit())
				150	return 64;
				151	return 32;
				152
				153	}
				154
Nadav Rotem	b696c36	2013-01-09 01:15:42 +0000	[diff] [blame]	155	unsigned X86TTI::getMaximumUnrollFactor() const {
				156	if (ST->isAtom())
				157	return 1;
				158
				159	// Sandybridge and Haswell have multiple execution ports and pipelined
				160	// vector units.
				161	if (ST->hasAVX())
				162	return 4;
				163
				164	return 2;
				165	}
				166
Arnold Schwaighofer	b977387	2013-04-04 23:26:21 +0000	[diff] [blame]	167	unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
				168	OperandValueKind Op1Info,
				169	OperandValueKind Op2Info) const {
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	170	// Legalize the type.
				171	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
				172
				173	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				174	assert(ISD && "Invalid opcode");
				175
Michael Liao	70dd7f9	2013-03-20 22:01:10 +0000	[diff] [blame]	176	static const CostTblEntry<MVT> AVX2CostTable[] = {
				177	// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
				178	// customize them to detect the cases where shift amount is a scalar one.
				179	{ ISD::SHL, MVT::v4i32, 1 },
				180	{ ISD::SRL, MVT::v4i32, 1 },
				181	{ ISD::SRA, MVT::v4i32, 1 },
				182	{ ISD::SHL, MVT::v8i32, 1 },
				183	{ ISD::SRL, MVT::v8i32, 1 },
				184	{ ISD::SRA, MVT::v8i32, 1 },
				185	{ ISD::SHL, MVT::v2i64, 1 },
				186	{ ISD::SRL, MVT::v2i64, 1 },
				187	{ ISD::SHL, MVT::v4i64, 1 },
				188	{ ISD::SRL, MVT::v4i64, 1 },
Arnold Schwaighofer	e9b5016	2013-04-03 21:46:05 +0000	[diff] [blame]	189
				190	{ ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence.
				191	{ ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized.
				192
				193	{ ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized.
				194	{ ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized.
				195
				196	{ ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized.
				197	{ ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
				198	{ ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
Arnold Schwaighofer	a04b9ef	2013-06-25 19:14:09 +0000	[diff] [blame]	199
				200	// Vectorizing division is a bad idea. See the SSE2 table for more comments.
				201	{ ISD::SDIV, MVT::v32i8, 32*20 },
				202	{ ISD::SDIV, MVT::v16i16, 16*20 },
				203	{ ISD::SDIV, MVT::v8i32, 8*20 },
				204	{ ISD::SDIV, MVT::v4i64, 4*20 },
				205	{ ISD::UDIV, MVT::v32i8, 32*20 },
				206	{ ISD::UDIV, MVT::v16i16, 16*20 },
				207	{ ISD::UDIV, MVT::v8i32, 8*20 },
				208	{ ISD::UDIV, MVT::v4i64, 4*20 },
Michael Liao	70dd7f9	2013-03-20 22:01:10 +0000	[diff] [blame]	209	};
				210
				211	// Look for AVX2 lowering tricks.
				212	if (ST->hasAVX2()) {
				213	int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
				214	ISD, LT.second);
				215	if (Idx != -1)
				216	return LT.first * AVX2CostTable[Idx].Cost;
				217	}
				218
Arnold Schwaighofer	44f902e	2013-04-04 23:26:24 +0000	[diff] [blame]	219	static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
				220	// We don't correctly identify costs of casts because they are marked as
				221	// custom.
				222	// Constant splats are cheaper for the following instructions.
				223	{ ISD::SHL, MVT::v16i8, 1 }, // psllw.
				224	{ ISD::SHL, MVT::v8i16, 1 }, // psllw.
				225	{ ISD::SHL, MVT::v4i32, 1 }, // pslld
				226	{ ISD::SHL, MVT::v2i64, 1 }, // psllq.
				227
				228	{ ISD::SRL, MVT::v16i8, 1 }, // psrlw.
				229	{ ISD::SRL, MVT::v8i16, 1 }, // psrlw.
				230	{ ISD::SRL, MVT::v4i32, 1 }, // psrld.
				231	{ ISD::SRL, MVT::v2i64, 1 }, // psrlq.
				232
				233	{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
				234	{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
				235	{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
				236	};
				237
				238	if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
				239	ST->hasSSE2()) {
				240	int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
				241	array_lengthof(SSE2UniformConstCostTable),
				242	ISD, LT.second);
				243	if (Idx != -1)
				244	return LT.first * SSE2UniformConstCostTable[Idx].Cost;
				245	}
				246
				247
Arnold Schwaighofer	e9b5016	2013-04-03 21:46:05 +0000	[diff] [blame]	248	static const CostTblEntry<MVT> SSE2CostTable[] = {
				249	// We don't correctly identify costs of casts because they are marked as
				250	// custom.
				251	// For some cases, where the shift amount is a scalar we would be able
				252	// to generate better code. Unfortunately, when this is the case the value
				253	// (the splat) will get hoisted out of the loop, thereby making it invisible
				254	// to ISel. The cost model must return worst case assumptions because it is
				255	// used for vectorization and we don't want to make vectorized code worse
				256	// than scalar code.
				257	{ ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence.
				258	{ ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
				259	{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
				260	{ ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
				261
				262	{ ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
				263	{ ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
				264	{ ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
				265	{ ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
				266
				267	{ ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized.
				268	{ ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized.
				269	{ ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
				270	{ ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
Arnold Schwaighofer	a04b9ef	2013-06-25 19:14:09 +0000	[diff] [blame]	271
				272	// It is not a good idea to vectorize division. We have to scalarize it and
				273	// in the process we will often end up having to spilling regular
				274	// registers. The overhead of division is going to dominate most kernels
				275	// anyways so try hard to prevent vectorization of division - it is
				276	// generally a bad idea. Assume somewhat arbitrarily that we have to be able
				277	// to hide "20 cycles" for each lane.
				278	{ ISD::SDIV, MVT::v16i8, 16*20 },
				279	{ ISD::SDIV, MVT::v8i16, 8*20 },
				280	{ ISD::SDIV, MVT::v4i32, 4*20 },
				281	{ ISD::SDIV, MVT::v2i64, 2*20 },
				282	{ ISD::UDIV, MVT::v16i8, 16*20 },
				283	{ ISD::UDIV, MVT::v8i16, 8*20 },
				284	{ ISD::UDIV, MVT::v4i32, 4*20 },
				285	{ ISD::UDIV, MVT::v2i64, 2*20 },
Arnold Schwaighofer	e9b5016	2013-04-03 21:46:05 +0000	[diff] [blame]	286	};
				287
				288	if (ST->hasSSE2()) {
				289	int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
				290	ISD, LT.second);
				291	if (Idx != -1)
				292	return LT.first * SSE2CostTable[Idx].Cost;
				293	}
				294
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	295	static const CostTblEntry<MVT> AVX1CostTable[] = {
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	296	// We don't have to scalarize unsupported ops. We can issue two half-sized
				297	// operations and we only need to extract the upper YMM half.
				298	// Two ops + 1 extract + 1 insert = 4.
				299	{ ISD::MUL, MVT::v8i32, 4 },
				300	{ ISD::SUB, MVT::v8i32, 4 },
				301	{ ISD::ADD, MVT::v8i32, 4 },
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	302	{ ISD::SUB, MVT::v4i64, 4 },
				303	{ ISD::ADD, MVT::v4i64, 4 },
Arnold Schwaighofer	20ef54f	2013-03-02 04:02:52 +0000	[diff] [blame]	304	// A v4i64 multiply is custom lowered as two split v2i64 vectors that then
				305	// are lowered as a series of long multiplies(3), shifts(4) and adds(2)
				306	// Because we believe v4i64 to be a legal type, we must also include the
				307	// split factor of two in the cost table. Therefore, the cost here is 18
				308	// instead of 9.
				309	{ ISD::MUL, MVT::v4i64, 18 },
				310	};
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	311
				312	// Look for AVX1 lowering tricks.
Arnold Schwaighofer	20ef54f	2013-03-02 04:02:52 +0000	[diff] [blame]	313	if (ST->hasAVX() && !ST->hasAVX2()) {
				314	int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
				315	ISD, LT.second);
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	316	if (Idx != -1)
				317	return LT.first * AVX1CostTable[Idx].Cost;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	318	}
Arnold Schwaighofer	20ef54f	2013-03-02 04:02:52 +0000	[diff] [blame]	319
				320	// Custom lowering of vectors.
				321	static const CostTblEntry<MVT> CustomLowered[] = {
				322	// A v2i64/v4i64 and multiply is custom lowered as a series of long
				323	// multiplies(3), shifts(4) and adds(2).
				324	{ ISD::MUL, MVT::v2i64, 9 },
				325	{ ISD::MUL, MVT::v4i64, 9 },
				326	};
				327	int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
				328	ISD, LT.second);
				329	if (Idx != -1)
				330	return LT.first * CustomLowered[Idx].Cost;
				331
				332	// Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
				333	// 2x pmuludq, 2x shuffle.
				334	if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
				335	!ST->hasSSE41())
				336	return 6;
				337
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	338	// Fallback to the default implementation.
Arnold Schwaighofer	b977387	2013-04-04 23:26:21 +0000	[diff] [blame]	339	return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
				340	Op2Info);
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	341	}
				342
				343	unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
				344	Type *SubTp) const {
				345	// We only estimate the cost of reverse shuffles.
Chandler Carruth	2109f47	2013-01-07 03:20:02 +0000	[diff] [blame]	346	if (Kind != SK_Reverse)
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	347	return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
				348
				349	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
				350	unsigned Cost = 1;
				351	if (LT.second.getSizeInBits() > 128)
				352	Cost = 3; // Extract + insert + copy.
				353
				354	// Multiple by the number of parts.
				355	return Cost * LT.first;
				356	}
				357
				358	unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type Dst, Type Src) const {
				359	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				360	assert(ISD && "Invalid opcode");
				361
Arnold Schwaighofer	f47d2d7	2013-04-08 18:05:48 +0000	[diff] [blame]	362	std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
				363	std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
				364
				365	static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
				366	// These are somewhat magic numbers justified by looking at the output of
				367	// Intel's IACA, running some kernels and making sure when we take
				368	// legalization into account the throughput will be overestimated.
				369	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
				370	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
				371	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
				372	{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
				373	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
				374	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
				375	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
				376	{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
				377	// There are faster sequences for float conversions.
				378	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
				379	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
				380	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
				381	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
				382	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
				383	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
				384	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
				385	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
				386	};
				387
				388	if (ST->hasSSE2() && !ST->hasAVX()) {
				389	int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
				390	array_lengthof(SSE2ConvTbl),
				391	ISD, LTDest.second, LTSrc.second);
				392	if (Idx != -1)
				393	return LTSrc.first * SSE2ConvTbl[Idx].Cost;
				394	}
				395
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	396	EVT SrcTy = TLI->getValueType(Src);
				397	EVT DstTy = TLI->getValueType(Dst);
				398
Arnold Schwaighofer	c0c7ff4	2013-04-17 20:04:53 +0000	[diff] [blame]	399	// The function getSimpleVT only handles simple value types.
				400	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
				401	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				402
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	403	static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	404	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
				405	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
				406	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
				407	{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
				408	{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
				409	{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
Benjamin Kramer	52ceb44	2013-04-01 10:23:49 +0000	[diff] [blame]	410
				411	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
				412	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
				413	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
				414	{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
				415	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
				416	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
				417	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
				418	{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
				419	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
				420	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
				421	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
				422	{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
				423
				424	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
				425	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
				426	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
				427	{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
				428	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
				429	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
				430	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
				431	{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
				432	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
				433	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
				434	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
				435	{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
				436
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	437	{ ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
				438	{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
				439	{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
				440	{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
Elena Demikhovsky	0ccdd13	2013-02-20 12:42:54 +0000	[diff] [blame]	441	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
Nadav Rotem	0f1bc60	2013-03-19 18:38:27 +0000	[diff] [blame]	442	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
				443	{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	444	{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	445	};
				446
				447	if (ST->hasAVX()) {
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	448	int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	449	array_lengthof(AVXConversionTbl),
				450	ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
				451	if (Idx != -1)
				452	return AVXConversionTbl[Idx].Cost;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	453	}
				454
				455	return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
				456	}
				457
				458	unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				459	Type *CondTy) const {
				460	// Legalize the type.
				461	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
				462
				463	MVT MTy = LT.second;
				464
				465	int ISD = TLI->InstructionOpcodeToISD(Opcode);
				466	assert(ISD && "Invalid opcode");
				467
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	468	static const CostTblEntry<MVT> SSE42CostTbl[] = {
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	469	{ ISD::SETCC, MVT::v2f64, 1 },
				470	{ ISD::SETCC, MVT::v4f32, 1 },
				471	{ ISD::SETCC, MVT::v2i64, 1 },
				472	{ ISD::SETCC, MVT::v4i32, 1 },
				473	{ ISD::SETCC, MVT::v8i16, 1 },
				474	{ ISD::SETCC, MVT::v16i8, 1 },
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	475	};
				476
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	477	static const CostTblEntry<MVT> AVX1CostTbl[] = {
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	478	{ ISD::SETCC, MVT::v4f64, 1 },
				479	{ ISD::SETCC, MVT::v8f32, 1 },
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	480	// AVX1 does not support 8-wide integer compare.
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	481	{ ISD::SETCC, MVT::v4i64, 4 },
				482	{ ISD::SETCC, MVT::v8i32, 4 },
				483	{ ISD::SETCC, MVT::v16i16, 4 },
				484	{ ISD::SETCC, MVT::v32i8, 4 },
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	485	};
				486
Renato Golin	d4c392e	2013-01-24 23:01:00 +0000	[diff] [blame]	487	static const CostTblEntry<MVT> AVX2CostTbl[] = {
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	488	{ ISD::SETCC, MVT::v4i64, 1 },
				489	{ ISD::SETCC, MVT::v8i32, 1 },
				490	{ ISD::SETCC, MVT::v16i16, 1 },
				491	{ ISD::SETCC, MVT::v32i8, 1 },
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	492	};
				493
				494	if (ST->hasAVX2()) {
Nadav Rotem	7d6c625	2013-06-18 20:41:52 +0000	[diff] [blame]	495	int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl),
				496	ISD, MTy);
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	497	if (Idx != -1)
				498	return LT.first * AVX2CostTbl[Idx].Cost;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	499	}
				500
				501	if (ST->hasAVX()) {
Nadav Rotem	7d6c625	2013-06-18 20:41:52 +0000	[diff] [blame]	502	int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl),
				503	ISD, MTy);
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	504	if (Idx != -1)
				505	return LT.first * AVX1CostTbl[Idx].Cost;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	506	}
				507
				508	if (ST->hasSSE42()) {
Nadav Rotem	7d6c625	2013-06-18 20:41:52 +0000	[diff] [blame]	509	int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl),
				510	ISD, MTy);
Renato Golin	e1fb059	2013-01-20 20:57:20 +0000	[diff] [blame]	511	if (Idx != -1)
				512	return LT.first * SSE42CostTbl[Idx].Cost;
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	513	}
				514
				515	return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
				516	}
				517
				518	unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
				519	unsigned Index) const {
				520	assert(Val->isVectorTy() && "This must be a vector type");
				521
				522	if (Index != -1U) {
				523	// Legalize the type.
				524	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
				525
				526	// This type is legalized to a scalar type.
				527	if (!LT.second.isVector())
				528	return 0;
				529
				530	// The type may be split. Normalize the index to the new type.
				531	unsigned Width = LT.second.getVectorNumElements();
				532	Index = Index % Width;
				533
				534	// Floating point scalars are already located in index #0.
				535	if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
				536	return 0;
				537	}
				538
				539	return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
				540	}
				541
Nadav Rotem	f9ecbcb	2013-06-27 17:52:04 +0000	[diff] [blame^]	542	unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
				543	bool Extract) const {
				544	assert (Ty->isVectorTy() && "Can only scalarize vectors");
				545	unsigned Cost = 0;
				546
				547	for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
				548	if (Insert)
				549	Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
				550	if (Extract)
				551	Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
				552	}
				553
				554	return Cost;
				555	}
				556
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	557	unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
				558	unsigned AddressSpace) const {
Nadav Rotem	f9ecbcb	2013-06-27 17:52:04 +0000	[diff] [blame^]	559	// Handle non power of two vectors such as <3 x float>
				560	if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
				561	unsigned NumElem = VTy->getVectorNumElements();
				562
				563	// Handle a few common cases:
				564	// <3 x float>
				565	if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
				566	// Cost = 64 bit store + extract + 32 bit store.
				567	return 3;
				568
				569	// <3 x double>
				570	if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
				571	// Cost = 128 bit store + unpack + 64 bit store.
				572	return 3;
				573
				574	// Assume that all other non power-of-two numbers are scalarized.
				575	if (!isPowerOf2_32(NumElem)) {
				576	unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
				577	VTy->getScalarType(),
				578	Alignment,
				579	AddressSpace);
				580	unsigned SplitCost = getScalarizationOverhead(Src,
				581	Opcode == Instruction::Load,
				582	Opcode==Instruction::Store);
				583	return NumElem * Cost + SplitCost;
				584	}
				585	}
				586
Chandler Carruth	664e354	2013-01-07 01:37:14 +0000	[diff] [blame]	587	// Legalize the type.
				588	std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
				589	assert((Opcode == Instruction::Load \|\| Opcode == Instruction::Store) &&
				590	"Invalid Opcode");
				591
				592	// Each load/store unit costs 1.
				593	unsigned Cost = LT.first * 1;
				594
				595	// On Sandybridge 256bit load/stores are double pumped
				596	// (but not on Haswell).
				597	if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
				598	Cost*=2;
				599
				600	return Cost;
				601	}