blob: c0a12a82a32d7bf488b8367dce13ba67bf78a783 [file] [log] [blame]
Chandler Carruth664e3542013-01-07 01:37:14 +00001//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9/// \file
10/// This file implements a TargetTransformInfo analysis pass specific to the
11/// X86 target machine. It uses the target's detailed information to provide
12/// more precise answers to certain TTI queries, while letting the target
13/// independent and default TTI implementations handle the rest.
14///
15//===----------------------------------------------------------------------===//
16
Chandler Carruth93dcdc42015-01-31 11:17:59 +000017#include "X86TargetTransformInfo.h"
Chandler Carruthd3e73552013-01-07 03:08:10 +000018#include "llvm/Analysis/TargetTransformInfo.h"
Chandler Carruth705b1852015-01-31 03:43:40 +000019#include "llvm/CodeGen/BasicTTIImpl.h"
Juergen Ributzkaf26beda2014-01-25 02:02:55 +000020#include "llvm/IR/IntrinsicInst.h"
Chandler Carruth664e3542013-01-07 01:37:14 +000021#include "llvm/Support/Debug.h"
Renato Golind4c392e2013-01-24 23:01:00 +000022#include "llvm/Target/CostTable.h"
Chandler Carruth8a8cd2b2014-01-07 11:48:04 +000023#include "llvm/Target/TargetLowering.h"
Hans Wennborg083ca9b2015-10-06 23:24:35 +000024
Chandler Carruth664e3542013-01-07 01:37:14 +000025using namespace llvm;
26
Chandler Carruth84e68b22014-04-22 02:41:26 +000027#define DEBUG_TYPE "x86tti"
28
Chandler Carruth664e3542013-01-07 01:37:14 +000029//===----------------------------------------------------------------------===//
30//
31// X86 cost model.
32//
33//===----------------------------------------------------------------------===//
34
Chandler Carruth705b1852015-01-31 03:43:40 +000035TargetTransformInfo::PopcntSupportKind
36X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
Chandler Carruth664e3542013-01-07 01:37:14 +000037 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
38 // TODO: Currently the __builtin_popcount() implementation using SSE3
39 // instructions is inefficient. Once the problem is fixed, we should
Craig Topper0a63e1d2013-09-08 00:47:31 +000040 // call ST->hasSSE3() instead of ST->hasPOPCNT().
Chandler Carruth705b1852015-01-31 03:43:40 +000041 return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
Chandler Carruth664e3542013-01-07 01:37:14 +000042}
43
Chandler Carruth705b1852015-01-31 03:43:40 +000044unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
Nadav Rotemb1791a72013-01-09 22:29:00 +000045 if (Vector && !ST->hasSSE1())
46 return 0;
47
Adam Nemet2820a5b2014-07-09 18:22:33 +000048 if (ST->is64Bit()) {
49 if (Vector && ST->hasAVX512())
50 return 32;
Chandler Carruth664e3542013-01-07 01:37:14 +000051 return 16;
Adam Nemet2820a5b2014-07-09 18:22:33 +000052 }
Chandler Carruth664e3542013-01-07 01:37:14 +000053 return 8;
54}
55
Chandler Carruth705b1852015-01-31 03:43:40 +000056unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
Nadav Rotemb1791a72013-01-09 22:29:00 +000057 if (Vector) {
Adam Nemet2820a5b2014-07-09 18:22:33 +000058 if (ST->hasAVX512()) return 512;
Nadav Rotemb1791a72013-01-09 22:29:00 +000059 if (ST->hasAVX()) return 256;
60 if (ST->hasSSE1()) return 128;
61 return 0;
62 }
63
64 if (ST->is64Bit())
65 return 64;
Nadav Rotemb1791a72013-01-09 22:29:00 +000066
Hans Wennborg083ca9b2015-10-06 23:24:35 +000067 return 32;
Nadav Rotemb1791a72013-01-09 22:29:00 +000068}
69
Wei Mi062c7442015-05-06 17:12:25 +000070unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
71 // If the loop will not be vectorized, don't interleave the loop.
72 // Let regular unroll to unroll the loop, which saves the overflow
73 // check and memory check cost.
74 if (VF == 1)
75 return 1;
76
Nadav Rotemb696c362013-01-09 01:15:42 +000077 if (ST->isAtom())
78 return 1;
79
80 // Sandybridge and Haswell have multiple execution ports and pipelined
81 // vector units.
82 if (ST->hasAVX())
83 return 4;
84
85 return 2;
86}
87
// Estimate the cost of a vector/scalar arithmetic instruction on X86.
// The candidate cost tables below are consulted in strict priority order
// (constant-operand specializations first, then newest ISA extensions down
// to SSE2, then AVX1/custom fallbacks); do not reorder the lookups.
int X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by constants power-of-two are
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as those of the
    // previous operation; conservatively assume OP_None.
    int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info,
                                          Op2Info, TargetTransformInfo::OP_None,
                                          TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  AVX2UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
  }

  // AVX-512 shifts on 512-bit types are single instructions.
  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = {
    { ISD::SHL,     MVT::v16i32,    1 },
    { ISD::SRL,     MVT::v16i32,    1 },
    { ISD::SRA,     MVT::v16i32,    1 },
    { ISD::SHL,     MVT::v8i64,     1 },
    { ISD::SRL,     MVT::v8i64,     1 },
    { ISD::SRA,     MVT::v8i64,     1 },
  };

  if (ST->hasAVX512()) {
    int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX512CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    1 },
    { ISD::SRA,     MVT::v4i32,    1 },
    { ISD::SHL,     MVT::v8i32,    1 },
    { ISD::SRL,     MVT::v8i32,    1 },
    { ISD::SRA,     MVT::v8i32,    1 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    1 },
    { ISD::SHL,     MVT::v4i64,    1 },
    { ISD::SRL,     MVT::v4i64,    1 },
  };

  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> XOPCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL,     MVT::v16i8,    1 },
    { ISD::SRL,     MVT::v16i8,    2 },
    { ISD::SRA,     MVT::v16i8,    2 },
    { ISD::SHL,     MVT::v8i16,    1 },
    { ISD::SRL,     MVT::v8i16,    2 },
    { ISD::SRA,     MVT::v8i16,    2 },
    { ISD::SHL,     MVT::v4i32,    1 },
    { ISD::SRL,     MVT::v4i32,    2 },
    { ISD::SRA,     MVT::v4i32,    2 },
    { ISD::SHL,     MVT::v2i64,    1 },
    { ISD::SRL,     MVT::v2i64,    2 },
    { ISD::SRA,     MVT::v2i64,    2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL,     MVT::v32i8,    2 },
    { ISD::SRL,     MVT::v32i8,    4 },
    { ISD::SRA,     MVT::v32i8,    4 },
    { ISD::SHL,     MVT::v16i16,   2 },
    { ISD::SRL,     MVT::v16i16,   4 },
    { ISD::SRA,     MVT::v16i16,   4 },
    { ISD::SHL,     MVT::v8i32,    2 },
    { ISD::SRL,     MVT::v8i32,    4 },
    { ISD::SRA,     MVT::v8i32,    4 },
    { ISD::SHL,     MVT::v4i64,    2 },
    { ISD::SRL,     MVT::v4i64,    4 },
    { ISD::SRA,     MVT::v4i64,    4 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    int Idx = CostTableLookup(XOPCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * XOPCostTable[Idx].Cost;
  }

  // AVX2 operations that are not legal but have a known custom lowering.
  static const CostTblEntry<MVT::SimpleValueType> AVX2CustomCostTable[] = {
    { ISD::SHL,  MVT::v32i8,      11 }, // vpblendvb sequence.
    { ISD::SHL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRL,  MVT::v32i8,      11 }, // vpblendvb sequence.
    { ISD::SRL,  MVT::v16i16,     10 }, // extend/vpsrlvd/pack sequence.

    { ISD::SRA,  MVT::v32i8,      24 }, // vpblendvb sequence.
    { ISD::SRA,  MVT::v16i16,     10 }, // extend/vpsravd/pack sequence.
    { ISD::SRA,  MVT::v2i64,       4 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,       4 }, // srl/xor/sub sequence.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,  MVT::v32i8,  32*20 },
    { ISD::SDIV,  MVT::v16i16, 16*20 },
    { ISD::SDIV,  MVT::v8i32,   8*20 },
    { ISD::SDIV,  MVT::v4i64,   4*20 },
    { ISD::UDIV,  MVT::v32i8,  32*20 },
    { ISD::UDIV,  MVT::v16i16, 16*20 },
    { ISD::UDIV,  MVT::v8i32,   8*20 },
    { ISD::UDIV,  MVT::v4i64,   4*20 },
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2CustomCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CustomCostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  SSE2UniformConstCostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // Constant splats are cheaper for the following instructions.
    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
    { ISD::SHL,  MVT::v32i8,  2 }, // psllw.
    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
    { ISD::SHL,  MVT::v16i16, 2 }, // psllw.
    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
    { ISD::SHL,  MVT::v8i32,  2 }, // pslld
    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
    { ISD::SHL,  MVT::v4i64,  2 }, // psllq.

    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
    { ISD::SRL,  MVT::v32i8,  2 }, // psrlw.
    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
    { ISD::SRL,  MVT::v16i16, 2 }, // psrlw.
    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
    { ISD::SRL,  MVT::v8i32,  2 }, // psrld.
    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
    { ISD::SRL,  MVT::v4i64,  2 }, // psrlq.

    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA,  MVT::v32i8,  8 }, // psrlw, pand, pxor, psubb.
    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
    { ISD::SRA,  MVT::v16i16, 2 }, // psraw.
    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
    { ISD::SRA,  MVT::v8i32,  2 }, // psrad.
    { ISD::SRA,  MVT::v2i64,  4 }, // 2 x psrad + shuffle.
    { ISD::SRA,  MVT::v4i64,  8 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i16,  6 }, // pmulhw sequence
    { ISD::UDIV, MVT::v8i16,  6 }, // pmulhuw sequence
    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    EVT VT = LT.second;
    // Vector shift left by non uniform constant can be lowered
    // into vector multiply (pmullw/pmulld).
    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
        (VT == MVT::v4i32 && ST->hasSSE41()))
      return LT.first;

    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
    // sequence of extract + two vector multiply + insert.
    // NOTE: we remap ISD to MUL here so the SSE2/AVX1 tables below price the
    // operation as the multiply it is lowered to.
    if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
        (ST->hasAVX() && !ST->hasAVX2()))
      ISD = ISD::MUL;

    // A vector shift left by non uniform constant is converted
    // into a vector multiply; the new multiply is eventually
    // lowered into a sequence of shuffles and 2 x pmuludq.
    if (VT == MVT::v4i32 && ST->hasSSE2())
      ISD = ISD::MUL;
  }

  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // For some cases, where the shift amount is a scalar we would be able
    // to generate better code. Unfortunately, when this is the case the value
    // (the splat) will get hoisted out of the loop, thereby making it invisible
    // to ISel. The cost model must return worst case assumptions because it is
    // used for vectorization and we don't want to make vectorized code worse
    // than scalar code.
    { ISD::SHL,  MVT::v16i8,    26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v32i8,  2*26 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v16i16, 2*32 }, // cmpgtb sequence.
    { ISD::SHL,  MVT::v4i32,   2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v8i32, 2*2*5 }, // We optimized this using mul.
    { ISD::SHL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
    { ISD::SHL,  MVT::v4i64,   2*4 }, // splat+shuffle sequence.

    { ISD::SRL,  MVT::v16i8,    26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v32i8,  2*26 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v16i16, 2*32 }, // cmpgtb sequence.
    { ISD::SRL,  MVT::v4i32,    16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v8i32,  2*16 }, // Shift each lane + blend.
    { ISD::SRL,  MVT::v2i64,     4 }, // splat+shuffle sequence.
    { ISD::SRL,  MVT::v4i64,   2*4 }, // splat+shuffle sequence.

    { ISD::SRA,  MVT::v16i8,    54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v32i8,  2*54 }, // unpacked cmpgtb sequence.
    { ISD::SRA,  MVT::v8i16,    32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v16i16, 2*32 }, // cmpgtb sequence.
    { ISD::SRA,  MVT::v4i32,    16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v8i32,  2*16 }, // Shift each lane + blend.
    { ISD::SRA,  MVT::v2i64,    12 }, // srl/xor/sub sequence.
    { ISD::SRA,  MVT::v4i64,  2*12 }, // srl/xor/sub sequence.

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spilling regular
    // registers. The overhead of division is going to dominate most kernels
    // anyways so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV,  MVT::v16i8,  16*20 },
    { ISD::SDIV,  MVT::v8i16,   8*20 },
    { ISD::SDIV,  MVT::v4i32,   4*20 },
    { ISD::SDIV,  MVT::v2i64,   2*20 },
    { ISD::UDIV,  MVT::v16i8,  16*20 },
    { ISD::UDIV,  MVT::v8i16,   8*20 },
    { ISD::UDIV,  MVT::v4i32,   4*20 },
    { ISD::UDIV,  MVT::v2i64,   2*20 },
  };

  if (ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL,     MVT::v16i16,   4 },
    { ISD::MUL,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v8i32,    4 },
    { ISD::ADD,     MVT::v8i32,    4 },
    { ISD::SUB,     MVT::v4i64,    4 },
    { ISD::ADD,     MVT::v4i64,    4 },
    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
    // Because we believe v4i64 to be a legal type, we must also include the
    // split factor of two in the cost table. Therefore, the cost here is 18
    // instead of 9.
    { ISD::MUL,     MVT::v4i64,    18 },
  };

  // Look for AVX1 lowering tricks.
  if (ST->hasAVX() && !ST->hasAVX2()) {
    EVT VT = LT.second;

    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
    if (Idx != -1)
      return LT.first * AVX1CostTable[Idx].Cost;
  }

  // Custom lowering of vectors.
  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
    // A v2i64/v4i64 and multiply is custom lowered as a series of long
    // multiplies(3), shifts(4) and adds(2).
    { ISD::MUL,     MVT::v2i64,    9 },
    { ISD::MUL,     MVT::v4i64,    9 },
  };
  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
  if (Idx != -1)
    return LT.first * CustomLowered[Idx].Cost;

  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
  // 2x pmuludq, 2x shuffle.
  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
      !ST->hasSSE41())
    return LT.first * 6;

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
}
421
// Estimate the cost of a vector shuffle on X86. Only SK_Reverse and
// SK_Alternate receive target-specific estimates; everything else is
// delegated to the base implementation. The alternate-shuffle tables are
// consulted from the newest ISA extension down to plain SSE2; keep that
// fall-through order.
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                               Type *SubTp) {
  // We only estimate the cost of reverse and alternate shuffles.
  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);

  if (Kind == TTI::SK_Reverse) {
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
    int Cost = 1;
    // Reversing a >128-bit vector also has to swap the two lanes.
    if (LT.second.getSizeInBits() > 128)
      Cost = 3; // Extract + insert + copy.

    // Multiply by the number of parts the legalized type is split into.
    return Cost * LT.first;
  }

  if (Kind == TTI::SK_Alternate) {
    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

    // The backend knows how to generate a single VEX.256 version of
    // instruction VPBLENDW if the target supports AVX2.
    if (ST->hasAVX2() && LT.second == MVT::v16i16)
      return LT.first;

    static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd

      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps

      // This shuffle is custom lowered into a sequence of:
      //  2x  vextractf128 , 2x vpblendw , 1x vinsertf128
      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},

      // This shuffle is custom lowered into a long sequence of:
      //  2x vextractf128 , 4x vpshufb , 2x vpor ,  1x vinsertf128
      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
    };

    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * AVXAltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
      // These are lowered into movsd.
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},

      // packed float vectors with four elements are lowered into BLENDI dag
      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},

      // This shuffle generates a single pshufw.
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},

      // There is no instruction that matches a v16i8 alternate shuffle.
      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
    };

    if (ST->hasSSE41()) {
      int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * SSE41AltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd

      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
      // the sequence 'shufps + pshufd'
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},

      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3},  // pshufb + pshufb + or
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // pshufb + pshufb + or
    };

    if (ST->hasSSSE3()) {
      int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd

      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},  // shufps + pshufd
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},  // shufps + pshufd

      // This is expanded into a long sequence of four extract + four insert.
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},  // 4 x pextrw + 4 pinsrw.

      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
    };

    // Fall-back (SSE3 and SSE2).
    int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
    if (Idx != -1)
      return LT.first * SSEAltShuffleTbl[Idx].Cost;
    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
536
Chandler Carruth93205eb2015-08-05 18:08:10 +0000537int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
Chandler Carruth664e3542013-01-07 01:37:14 +0000538 int ISD = TLI->InstructionOpcodeToISD(Opcode);
539 assert(ISD && "Invalid opcode");
540
Elena Demikhovsky27012472014-09-16 07:57:37 +0000541 static const TypeConversionCostTblEntry<MVT::SimpleValueType>
542 AVX512ConversionTbl[] = {
543 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
544 { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
545 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
546 { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 },
547
548 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
549 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
550 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
551 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
552 { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 },
553
554 // v16i1 -> v16i32 - load + broadcast
555 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
556 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
557
558 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
559 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
560 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
561 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
562 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 },
563 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 },
564
Elena Demikhovskyd5e95b52014-11-13 11:46:16 +0000565 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
566 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
567 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
568 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
569 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
570 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
571 { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
Elena Demikhovsky27012472014-09-16 07:57:37 +0000572 };
573
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000574 static const TypeConversionCostTblEntry<MVT::SimpleValueType>
Tim Northoverf0e21612014-02-06 18:18:36 +0000575 AVX2ConversionTbl[] = {
576 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
577 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
578 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
579 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
580 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
581 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
582 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
583 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
584 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
585 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
586 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
587 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
588 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
589 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
590 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
591 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
592
593 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
594 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
595 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
596 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
597 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
598 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
Elena Demikhovsky27012472014-09-16 07:57:37 +0000599
600 { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
601 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
Quentin Colombet360460b2014-11-11 02:23:47 +0000602
603 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
Tim Northoverf0e21612014-02-06 18:18:36 +0000604 };
605
606 static const TypeConversionCostTblEntry<MVT::SimpleValueType>
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000607 AVXConversionTbl[] = {
Tim Northoverf0e21612014-02-06 18:18:36 +0000608 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
609 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
610 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
611 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
612 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
613 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
614 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
615 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
616 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
617 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
618 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
619 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
620 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
621 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
622 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
623 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
624
625 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
626 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
627 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
628 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
629 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
630 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
631 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
Benjamin Kramer52ceb442013-04-01 10:23:49 +0000632
633 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
634 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
635 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
636 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
637 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
638 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
639 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
640 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
641 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
642 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
643 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
644 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
645
646 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
647 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
648 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
649 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
650 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
651 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
652 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
653 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
654 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
655 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
656 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
657 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
Quentin Colombet85b904d2014-03-27 22:27:41 +0000658 // The generic code to compute the scalar overhead is currently broken.
659 // Workaround this limitation by estimating the scalarization overhead
660 // here. We have roughly 10 instructions per scalar element.
661 // Multiply that by the vector width.
662 // FIXME: remove that when PR19268 is fixed.
Quentin Colombet3914bf52014-03-27 00:52:16 +0000663 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
664 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
Benjamin Kramer52ceb442013-04-01 10:23:49 +0000665
Jim Grosbach72fbde82014-03-27 00:04:11 +0000666 { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
Renato Goline1fb0592013-01-20 20:57:20 +0000667 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
Adam Nemet6dafe972014-03-30 18:07:13 +0000668 // This node is expanded into scalarized operations but BasicTTI is overly
669 // optimistic estimating its cost. It computes 3 per element (one
670 // vector-extract, one scalar conversion and one vector-insert). The
671 // problem is that the inserts form a read-modify-write chain so latency
672 // should be factored in too. Inflating the cost per element by 1.
673 { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
Adam Nemet10c4ce22014-03-31 21:54:48 +0000674 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
Chandler Carruth664e3542013-01-07 01:37:14 +0000675 };
676
Simon Pilgrime2c244f2015-07-19 15:36:12 +0000677 static const TypeConversionCostTblEntry<MVT::SimpleValueType>
678 SSE2ConvTbl[] = {
679 // These are somewhat magic numbers justified by looking at the output of
680 // Intel's IACA, running some kernels and making sure when we take
681 // legalization into account the throughput will be overestimated.
682 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
683 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
684 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
685 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
686 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
687 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
688 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
689 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
690 // There are faster sequences for float conversions.
691 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
692 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
693 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
694 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
695 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
696 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
697 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
698 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
699 };
700
Chandler Carruth93205eb2015-08-05 18:08:10 +0000701 std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
702 std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
Simon Pilgrime2c244f2015-07-19 15:36:12 +0000703
704 if (ST->hasSSE2() && !ST->hasAVX()) {
705 int Idx =
706 ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
707 if (Idx != -1)
708 return LTSrc.first * SSE2ConvTbl[Idx].Cost;
709 }
710
711 if (ST->hasAVX512()) {
712 int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second,
713 LTSrc.second);
714 if (Idx != -1)
715 return AVX512ConversionTbl[Idx].Cost;
716 }
717
718 EVT SrcTy = TLI->getValueType(DL, Src);
719 EVT DstTy = TLI->getValueType(DL, Dst);
720
721 // The function getSimpleVT only handles simple value types.
722 if (!SrcTy.isSimple() || !DstTy.isSimple())
723 return BaseT::getCastInstrCost(Opcode, Dst, Src);
724
Tim Northoverf0e21612014-02-06 18:18:36 +0000725 if (ST->hasAVX2()) {
726 int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
727 DstTy.getSimpleVT(), SrcTy.getSimpleVT());
728 if (Idx != -1)
729 return AVX2ConversionTbl[Idx].Cost;
730 }
731
Chandler Carruth664e3542013-01-07 01:37:14 +0000732 if (ST->hasAVX()) {
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000733 int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
734 SrcTy.getSimpleVT());
Renato Goline1fb0592013-01-20 20:57:20 +0000735 if (Idx != -1)
736 return AVXConversionTbl[Idx].Cost;
Chandler Carruth664e3542013-01-07 01:37:14 +0000737 }
738
Chandler Carruth705b1852015-01-31 03:43:40 +0000739 return BaseT::getCastInstrCost(Opcode, Dst, Src);
Chandler Carruth664e3542013-01-07 01:37:14 +0000740}
741
Chandler Carruth93205eb2015-08-05 18:08:10 +0000742int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
Chandler Carruth664e3542013-01-07 01:37:14 +0000743 // Legalize the type.
Chandler Carruth93205eb2015-08-05 18:08:10 +0000744 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
Chandler Carruth664e3542013-01-07 01:37:14 +0000745
746 MVT MTy = LT.second;
747
748 int ISD = TLI->InstructionOpcodeToISD(Opcode);
749 assert(ISD && "Invalid opcode");
750
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000751 static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
Renato Goline1fb0592013-01-20 20:57:20 +0000752 { ISD::SETCC, MVT::v2f64, 1 },
753 { ISD::SETCC, MVT::v4f32, 1 },
754 { ISD::SETCC, MVT::v2i64, 1 },
755 { ISD::SETCC, MVT::v4i32, 1 },
756 { ISD::SETCC, MVT::v8i16, 1 },
757 { ISD::SETCC, MVT::v16i8, 1 },
Chandler Carruth664e3542013-01-07 01:37:14 +0000758 };
759
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000760 static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
Renato Goline1fb0592013-01-20 20:57:20 +0000761 { ISD::SETCC, MVT::v4f64, 1 },
762 { ISD::SETCC, MVT::v8f32, 1 },
Chandler Carruth664e3542013-01-07 01:37:14 +0000763 // AVX1 does not support 8-wide integer compare.
Renato Goline1fb0592013-01-20 20:57:20 +0000764 { ISD::SETCC, MVT::v4i64, 4 },
765 { ISD::SETCC, MVT::v8i32, 4 },
766 { ISD::SETCC, MVT::v16i16, 4 },
767 { ISD::SETCC, MVT::v32i8, 4 },
Chandler Carruth664e3542013-01-07 01:37:14 +0000768 };
769
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000770 static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
Renato Goline1fb0592013-01-20 20:57:20 +0000771 { ISD::SETCC, MVT::v4i64, 1 },
772 { ISD::SETCC, MVT::v8i32, 1 },
773 { ISD::SETCC, MVT::v16i16, 1 },
774 { ISD::SETCC, MVT::v32i8, 1 },
Chandler Carruth664e3542013-01-07 01:37:14 +0000775 };
776
Elena Demikhovsky27012472014-09-16 07:57:37 +0000777 static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = {
778 { ISD::SETCC, MVT::v8i64, 1 },
779 { ISD::SETCC, MVT::v16i32, 1 },
780 { ISD::SETCC, MVT::v8f64, 1 },
781 { ISD::SETCC, MVT::v16f32, 1 },
782 };
783
784 if (ST->hasAVX512()) {
785 int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy);
786 if (Idx != -1)
787 return LT.first * AVX512CostTbl[Idx].Cost;
788 }
789
Chandler Carruth664e3542013-01-07 01:37:14 +0000790 if (ST->hasAVX2()) {
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000791 int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
Renato Goline1fb0592013-01-20 20:57:20 +0000792 if (Idx != -1)
793 return LT.first * AVX2CostTbl[Idx].Cost;
Chandler Carruth664e3542013-01-07 01:37:14 +0000794 }
795
796 if (ST->hasAVX()) {
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000797 int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
Renato Goline1fb0592013-01-20 20:57:20 +0000798 if (Idx != -1)
799 return LT.first * AVX1CostTbl[Idx].Cost;
Chandler Carruth664e3542013-01-07 01:37:14 +0000800 }
801
802 if (ST->hasSSE42()) {
Benjamin Kramer21585fd2013-08-09 19:33:32 +0000803 int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
Renato Goline1fb0592013-01-20 20:57:20 +0000804 if (Idx != -1)
805 return LT.first * SSE42CostTbl[Idx].Cost;
Chandler Carruth664e3542013-01-07 01:37:14 +0000806 }
807
Chandler Carruth705b1852015-01-31 03:43:40 +0000808 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
Chandler Carruth664e3542013-01-07 01:37:14 +0000809}
810
Chandler Carruth93205eb2015-08-05 18:08:10 +0000811int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
Chandler Carruth664e3542013-01-07 01:37:14 +0000812 assert(Val->isVectorTy() && "This must be a vector type");
813
814 if (Index != -1U) {
815 // Legalize the type.
Chandler Carruth93205eb2015-08-05 18:08:10 +0000816 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
Chandler Carruth664e3542013-01-07 01:37:14 +0000817
818 // This type is legalized to a scalar type.
819 if (!LT.second.isVector())
820 return 0;
821
822 // The type may be split. Normalize the index to the new type.
823 unsigned Width = LT.second.getVectorNumElements();
824 Index = Index % Width;
825
826 // Floating point scalars are already located in index #0.
827 if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
828 return 0;
829 }
830
Chandler Carruth705b1852015-01-31 03:43:40 +0000831 return BaseT::getVectorInstrCost(Opcode, Val, Index);
Chandler Carruth664e3542013-01-07 01:37:14 +0000832}
833
Chandler Carruth93205eb2015-08-05 18:08:10 +0000834int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000835 assert (Ty->isVectorTy() && "Can only scalarize vectors");
Chandler Carruth93205eb2015-08-05 18:08:10 +0000836 int Cost = 0;
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000837
838 for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
839 if (Insert)
Chandler Carruth705b1852015-01-31 03:43:40 +0000840 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000841 if (Extract)
Chandler Carruth705b1852015-01-31 03:43:40 +0000842 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000843 }
844
845 return Cost;
846}
847
Chandler Carruth93205eb2015-08-05 18:08:10 +0000848int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
849 unsigned AddressSpace) {
Alp Tokerf907b892013-12-05 05:44:44 +0000850 // Handle non-power-of-two vectors such as <3 x float>
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000851 if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
852 unsigned NumElem = VTy->getVectorNumElements();
853
854 // Handle a few common cases:
855 // <3 x float>
856 if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
857 // Cost = 64 bit store + extract + 32 bit store.
858 return 3;
859
860 // <3 x double>
861 if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
862 // Cost = 128 bit store + unpack + 64 bit store.
863 return 3;
864
Alp Tokerf907b892013-12-05 05:44:44 +0000865 // Assume that all other non-power-of-two numbers are scalarized.
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000866 if (!isPowerOf2_32(NumElem)) {
Chandler Carruth93205eb2015-08-05 18:08:10 +0000867 int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
868 AddressSpace);
869 int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
870 Opcode == Instruction::Store);
Nadav Rotemf9ecbcb2013-06-27 17:52:04 +0000871 return NumElem * Cost + SplitCost;
872 }
873 }
874
Chandler Carruth664e3542013-01-07 01:37:14 +0000875 // Legalize the type.
Chandler Carruth93205eb2015-08-05 18:08:10 +0000876 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
Chandler Carruth664e3542013-01-07 01:37:14 +0000877 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
878 "Invalid Opcode");
879
880 // Each load/store unit costs 1.
Chandler Carruth93205eb2015-08-05 18:08:10 +0000881 int Cost = LT.first * 1;
Chandler Carruth664e3542013-01-07 01:37:14 +0000882
883 // On Sandybridge 256bit load/stores are double pumped
884 // (but not on Haswell).
885 if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
886 Cost*=2;
887
888 return Cost;
889}
Arnold Schwaighofer6042a262013-07-12 19:16:07 +0000890
Chandler Carruth93205eb2015-08-05 18:08:10 +0000891int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
892 unsigned Alignment,
893 unsigned AddressSpace) {
Elena Demikhovskya3232f72015-01-25 08:44:46 +0000894 VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
895 if (!SrcVTy)
896 // To calculate scalar take the regular cost, without mask
897 return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
898
899 unsigned NumElem = SrcVTy->getVectorNumElements();
900 VectorType *MaskTy =
901 VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
902 if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
903 (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
904 !isPowerOf2_32(NumElem)) {
905 // Scalarization
Chandler Carruth93205eb2015-08-05 18:08:10 +0000906 int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
907 int ScalarCompareCost = getCmpSelInstrCost(
Hans Wennborg083ca9b2015-10-06 23:24:35 +0000908 Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr);
Chandler Carruth93205eb2015-08-05 18:08:10 +0000909 int BranchCost = getCFInstrCost(Instruction::Br);
910 int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
Elena Demikhovskya3232f72015-01-25 08:44:46 +0000911
Chandler Carruth93205eb2015-08-05 18:08:10 +0000912 int ValueSplitCost = getScalarizationOverhead(
913 SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
914 int MemopCost =
Chandler Carruth705b1852015-01-31 03:43:40 +0000915 NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
916 Alignment, AddressSpace);
Elena Demikhovskya3232f72015-01-25 08:44:46 +0000917 return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
918 }
919
920 // Legalize the type.
Chandler Carruth93205eb2015-08-05 18:08:10 +0000921 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
922 int Cost = 0;
Mehdi Amini44ede332015-07-09 02:09:04 +0000923 if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() &&
Elena Demikhovskya3232f72015-01-25 08:44:46 +0000924 LT.second.getVectorNumElements() == NumElem)
925 // Promotion requires expand/truncate for data and a shuffle for mask.
Hans Wennborg083ca9b2015-10-06 23:24:35 +0000926 Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
927 getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
Chandler Carruth705b1852015-01-31 03:43:40 +0000928
Elena Demikhovskya3232f72015-01-25 08:44:46 +0000929 else if (LT.second.getVectorNumElements() > NumElem) {
930 VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
931 LT.second.getVectorNumElements());
932 // Expanding requires fill mask with zeroes
Chandler Carruth705b1852015-01-31 03:43:40 +0000933 Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
Elena Demikhovskya3232f72015-01-25 08:44:46 +0000934 }
935 if (!ST->hasAVX512())
936 return Cost + LT.first*4; // Each maskmov costs 4
937
938 // AVX-512 masked load/store is cheapper
939 return Cost+LT.first;
940}
941
Chandler Carruth93205eb2015-08-05 18:08:10 +0000942int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
Arnold Schwaighofer6042a262013-07-12 19:16:07 +0000943 // Address computations in vectorized code with non-consecutive addresses will
944 // likely result in more instructions compared to scalar code where the
945 // computation can more often be merged into the index mode. The resulting
946 // extra micro-ops can significantly decrease throughput.
947 unsigned NumVectorInstToHideOverhead = 10;
948
949 if (Ty->isVectorTy() && IsComplex)
950 return NumVectorInstToHideOverhead;
951
Chandler Carruth705b1852015-01-31 03:43:40 +0000952 return BaseT::getAddressComputationCost(Ty, IsComplex);
Arnold Schwaighofer6042a262013-07-12 19:16:07 +0000953}
Yi Jiang5c343de2013-09-19 17:48:48 +0000954
Chandler Carruth93205eb2015-08-05 18:08:10 +0000955int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
956 bool IsPairwise) {
Michael Liao5bf95782014-12-04 05:20:33 +0000957
Chandler Carruth93205eb2015-08-05 18:08:10 +0000958 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
Michael Liao5bf95782014-12-04 05:20:33 +0000959
Yi Jiang5c343de2013-09-19 17:48:48 +0000960 MVT MTy = LT.second;
Michael Liao5bf95782014-12-04 05:20:33 +0000961
Yi Jiang5c343de2013-09-19 17:48:48 +0000962 int ISD = TLI->InstructionOpcodeToISD(Opcode);
963 assert(ISD && "Invalid opcode");
Michael Liao5bf95782014-12-04 05:20:33 +0000964
965 // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
966 // and make it as the cost.
967
Yi Jiang5c343de2013-09-19 17:48:48 +0000968 static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
969 { ISD::FADD, MVT::v2f64, 2 },
970 { ISD::FADD, MVT::v4f32, 4 },
971 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
972 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
973 { ISD::ADD, MVT::v8i16, 5 },
974 };
Michael Liao5bf95782014-12-04 05:20:33 +0000975
Yi Jiang5c343de2013-09-19 17:48:48 +0000976 static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
977 { ISD::FADD, MVT::v4f32, 4 },
978 { ISD::FADD, MVT::v4f64, 5 },
979 { ISD::FADD, MVT::v8f32, 7 },
980 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
981 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
982 { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
983 { ISD::ADD, MVT::v8i16, 5 },
984 { ISD::ADD, MVT::v8i32, 5 },
985 };
986
987 static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
988 { ISD::FADD, MVT::v2f64, 2 },
989 { ISD::FADD, MVT::v4f32, 4 },
990 { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
991 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
992 { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
993 };
Michael Liao5bf95782014-12-04 05:20:33 +0000994
Yi Jiang5c343de2013-09-19 17:48:48 +0000995 static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
996 { ISD::FADD, MVT::v4f32, 3 },
997 { ISD::FADD, MVT::v4f64, 3 },
998 { ISD::FADD, MVT::v8f32, 4 },
999 { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
1000 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
1001 { ISD::ADD, MVT::v4i64, 3 },
1002 { ISD::ADD, MVT::v8i16, 4 },
1003 { ISD::ADD, MVT::v8i32, 5 },
1004 };
Michael Liao5bf95782014-12-04 05:20:33 +00001005
Yi Jiang5c343de2013-09-19 17:48:48 +00001006 if (IsPairwise) {
1007 if (ST->hasAVX()) {
1008 int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
1009 if (Idx != -1)
1010 return LT.first * AVX1CostTblPairWise[Idx].Cost;
1011 }
Michael Liao5bf95782014-12-04 05:20:33 +00001012
Yi Jiang5c343de2013-09-19 17:48:48 +00001013 if (ST->hasSSE42()) {
1014 int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
1015 if (Idx != -1)
1016 return LT.first * SSE42CostTblPairWise[Idx].Cost;
1017 }
1018 } else {
1019 if (ST->hasAVX()) {
1020 int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
1021 if (Idx != -1)
1022 return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
1023 }
Michael Liao5bf95782014-12-04 05:20:33 +00001024
Yi Jiang5c343de2013-09-19 17:48:48 +00001025 if (ST->hasSSE42()) {
1026 int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
1027 if (Idx != -1)
1028 return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
1029 }
1030 }
1031
Chandler Carruth705b1852015-01-31 03:43:40 +00001032 return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
Yi Jiang5c343de2013-09-19 17:48:48 +00001033}
1034
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001035/// \brief Calculate the cost of materializing a 64-bit value. This helper
1036/// method might only calculate a fraction of a larger immediate. Therefore it
1037/// is valid to return a cost of ZERO.
Chandler Carruth93205eb2015-08-05 18:08:10 +00001038int X86TTIImpl::getIntImmCost(int64_t Val) {
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001039 if (Val == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +00001040 return TTI::TCC_Free;
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001041
1042 if (isInt<32>(Val))
Chandler Carruth705b1852015-01-31 03:43:40 +00001043 return TTI::TCC_Basic;
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001044
Chandler Carruth705b1852015-01-31 03:43:40 +00001045 return 2 * TTI::TCC_Basic;
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001046}
1047
Chandler Carruth93205eb2015-08-05 18:08:10 +00001048int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001049 assert(Ty->isIntegerTy());
1050
1051 unsigned BitSize = Ty->getPrimitiveSizeInBits();
1052 if (BitSize == 0)
1053 return ~0U;
1054
Juergen Ributzka43176172014-05-19 21:00:53 +00001055 // Never hoist constants larger than 128bit, because this might lead to
1056 // incorrect code generation or assertions in codegen.
1057 // Fixme: Create a cost model for types larger than i128 once the codegen
1058 // issues have been fixed.
1059 if (BitSize > 128)
Chandler Carruth705b1852015-01-31 03:43:40 +00001060 return TTI::TCC_Free;
Juergen Ributzka43176172014-05-19 21:00:53 +00001061
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001062 if (Imm == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +00001063 return TTI::TCC_Free;
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001064
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001065 // Sign-extend all constants to a multiple of 64-bit.
1066 APInt ImmVal = Imm;
1067 if (BitSize & 0x3f)
1068 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
1069
1070 // Split the constant into 64-bit chunks and calculate the cost for each
1071 // chunk.
Chandler Carruth93205eb2015-08-05 18:08:10 +00001072 int Cost = 0;
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001073 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
1074 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
1075 int64_t Val = Tmp.getSExtValue();
1076 Cost += getIntImmCost(Val);
1077 }
1078 // We need at least one instruction to materialze the constant.
Chandler Carruth93205eb2015-08-05 18:08:10 +00001079 return std::max(1, Cost);
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001080}
1081
Chandler Carruth93205eb2015-08-05 18:08:10 +00001082int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
1083 Type *Ty) {
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001084 assert(Ty->isIntegerTy());
1085
1086 unsigned BitSize = Ty->getPrimitiveSizeInBits();
Juergen Ributzka43176172014-05-19 21:00:53 +00001087 // There is no cost model for constants with a bit size of 0. Return TCC_Free
1088 // here, so that constant hoisting will ignore this constant.
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001089 if (BitSize == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +00001090 return TTI::TCC_Free;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001091
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001092 unsigned ImmIdx = ~0U;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001093 switch (Opcode) {
Chandler Carruth705b1852015-01-31 03:43:40 +00001094 default:
1095 return TTI::TCC_Free;
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001096 case Instruction::GetElementPtr:
Juergen Ributzka27435b32014-04-02 21:45:36 +00001097 // Always hoist the base address of a GetElementPtr. This prevents the
1098 // creation of new constants for every base constant that gets constant
1099 // folded with the offset.
Juergen Ributzka631c4912014-03-25 18:01:25 +00001100 if (Idx == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +00001101 return 2 * TTI::TCC_Basic;
1102 return TTI::TCC_Free;
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001103 case Instruction::Store:
1104 ImmIdx = 0;
1105 break;
Craig Topper79dd1bf2015-10-06 02:50:24 +00001106 case Instruction::And:
1107 // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
1108 // by using a 32-bit operation with implicit zero extension. Detect such
1109 // immediates here as the normal path expects bit 31 to be sign extended.
1110 if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue()))
1111 return TTI::TCC_Free;
1112 // Fallthrough
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001113 case Instruction::Add:
1114 case Instruction::Sub:
1115 case Instruction::Mul:
1116 case Instruction::UDiv:
1117 case Instruction::SDiv:
1118 case Instruction::URem:
1119 case Instruction::SRem:
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001120 case Instruction::Or:
1121 case Instruction::Xor:
1122 case Instruction::ICmp:
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001123 ImmIdx = 1;
1124 break;
Michael Zolotukhin1f4a9602014-04-30 19:17:32 +00001125 // Always return TCC_Free for the shift value of a shift instruction.
1126 case Instruction::Shl:
1127 case Instruction::LShr:
1128 case Instruction::AShr:
1129 if (Idx == 1)
Chandler Carruth705b1852015-01-31 03:43:40 +00001130 return TTI::TCC_Free;
Michael Zolotukhin1f4a9602014-04-30 19:17:32 +00001131 break;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001132 case Instruction::Trunc:
1133 case Instruction::ZExt:
1134 case Instruction::SExt:
1135 case Instruction::IntToPtr:
1136 case Instruction::PtrToInt:
1137 case Instruction::BitCast:
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001138 case Instruction::PHI:
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001139 case Instruction::Call:
1140 case Instruction::Select:
1141 case Instruction::Ret:
1142 case Instruction::Load:
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001143 break;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001144 }
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001145
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001146 if (Idx == ImmIdx) {
Chandler Carruth93205eb2015-08-05 18:08:10 +00001147 int NumConstants = (BitSize + 63) / 64;
1148 int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
Chandler Carruth705b1852015-01-31 03:43:40 +00001149 return (Cost <= NumConstants * TTI::TCC_Basic)
Chandler Carruth93205eb2015-08-05 18:08:10 +00001150 ? static_cast<int>(TTI::TCC_Free)
Chandler Carruth705b1852015-01-31 03:43:40 +00001151 : Cost;
Juergen Ributzkab2e4edb2014-06-10 00:32:29 +00001152 }
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001153
Chandler Carruth705b1852015-01-31 03:43:40 +00001154 return X86TTIImpl::getIntImmCost(Imm, Ty);
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001155}
1156
Chandler Carruth93205eb2015-08-05 18:08:10 +00001157int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
1158 Type *Ty) {
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001159 assert(Ty->isIntegerTy());
1160
1161 unsigned BitSize = Ty->getPrimitiveSizeInBits();
Juergen Ributzka43176172014-05-19 21:00:53 +00001162 // There is no cost model for constants with a bit size of 0. Return TCC_Free
1163 // here, so that constant hoisting will ignore this constant.
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001164 if (BitSize == 0)
Chandler Carruth705b1852015-01-31 03:43:40 +00001165 return TTI::TCC_Free;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001166
1167 switch (IID) {
Chandler Carruth705b1852015-01-31 03:43:40 +00001168 default:
1169 return TTI::TCC_Free;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001170 case Intrinsic::sadd_with_overflow:
1171 case Intrinsic::uadd_with_overflow:
1172 case Intrinsic::ssub_with_overflow:
1173 case Intrinsic::usub_with_overflow:
1174 case Intrinsic::smul_with_overflow:
1175 case Intrinsic::umul_with_overflow:
Juergen Ributzkaf0dff492014-03-21 06:04:45 +00001176 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
Chandler Carruth705b1852015-01-31 03:43:40 +00001177 return TTI::TCC_Free;
Juergen Ributzka5eef98c2014-03-25 18:01:23 +00001178 break;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001179 case Intrinsic::experimental_stackmap:
Juergen Ributzka5eef98c2014-03-25 18:01:23 +00001180 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
Chandler Carruth705b1852015-01-31 03:43:40 +00001181 return TTI::TCC_Free;
Juergen Ributzka5eef98c2014-03-25 18:01:23 +00001182 break;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001183 case Intrinsic::experimental_patchpoint_void:
1184 case Intrinsic::experimental_patchpoint_i64:
Juergen Ributzka5eef98c2014-03-25 18:01:23 +00001185 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
Chandler Carruth705b1852015-01-31 03:43:40 +00001186 return TTI::TCC_Free;
Juergen Ributzka5eef98c2014-03-25 18:01:23 +00001187 break;
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001188 }
Chandler Carruth705b1852015-01-31 03:43:40 +00001189 return X86TTIImpl::getIntImmCost(Imm, Ty);
Juergen Ributzkaf26beda2014-01-25 02:02:55 +00001190}
NAKAMURA Takumi0b305db2015-07-14 04:03:49 +00001191
1192bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
1193 int DataWidth = DataTy->getPrimitiveSizeInBits();
1194
1195 // Todo: AVX512 allows gather/scatter, works with strided and random as well
1196 if ((DataWidth < 32) || (Consecutive == 0))
1197 return false;
1198 if (ST->hasAVX512() || ST->hasAVX2())
1199 return true;
1200 return false;
1201}
Elena Demikhovskyf1de34b2014-12-04 09:40:44 +00001202
Chandler Carruth705b1852015-01-31 03:43:40 +00001203bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) {
Elena Demikhovsky3fcafa22014-12-14 09:43:50 +00001204 return isLegalMaskedLoad(DataType, Consecutive);
Elena Demikhovskyf1de34b2014-12-04 09:40:44 +00001205}
1206
Eric Christopherd566fb12015-07-29 22:09:48 +00001207bool X86TTIImpl::areInlineCompatible(const Function *Caller,
1208 const Function *Callee) const {
Eric Christophere1002262015-07-02 01:11:50 +00001209 const TargetMachine &TM = getTLI()->getTargetMachine();
1210
1211 // Work this as a subsetting of subtarget features.
1212 const FeatureBitset &CallerBits =
1213 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1214 const FeatureBitset &CalleeBits =
1215 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1216
1217 // FIXME: This is likely too limiting as it will include subtarget features
1218 // that we might not care about for inlining, but it is conservatively
1219 // correct.
1220 return (CallerBits & CalleeBits) == CalleeBits;
1221}