//===--------- X86InterleavedAccess.cpp ----------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===--------------------------------------------------------------------===//
///
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
///
//===--------------------------------------------------------------------===//

#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

namespace {
/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
///      other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// \brief Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// \brief Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// \brief Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// \brief Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// \brief Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// \brief Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffles that represent stride 4 for 64-bit and 8-bit element
  // types, and stride 3 for 8-bit element types, for the wide-instruction
  // sizes checked below.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768))
    return true;

  return false;
}
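// For example (illustrative, derived from the checks in isSupported above): a
// stride-3 deinterleaving load of <96 x i8> (WideInstSize = 768) is accepted,
// as is a stride-4 store whose wide shuffle type is <32 x i8>
// (WideInstSize = 256).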

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {

  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecTy = VecInst->getType();
  (void)VecTy;
  assert(VecTy->isVectorTy() &&
         DL.getTypeSizeInBits(VecTy) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the
  // information in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  if (DL.getTypeSizeInBits(VecTy) == 768) {
    Type *VecTran =
        VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
    NumLoads = NumSubVectors * 2;
  } else
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}
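// For example (illustrative): a 768-bit stride-3 wide load (<96 x i8>) with
// NumSubVectors = 3 is split into NumLoads = 6 consecutive <16 x i8> loads,
// while a <16 x i64> wide load decomposed with NumSubVectors = 4 yields 4
// <4 x i64> loads.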

// Create shuffle mask for concatenation of two half vectors.
// Low = false: mask generated for the shuffle
// shuffle(VEC1,VEC2,{NumElement/2, NumElement/2+1, NumElement/2+2...,
//                    NumElement-1, NumElement+NumElement/2,
//                    NumElement+NumElement/2+1..., 2*NumElement-1})
// = concat(high_half(VEC1),high_half(VEC2))
// Low = true: mask generated for the shuffle
// shuffle(VEC1,VEC2,{0,1,2,...,NumElement/2-1,NumElement,
//                    NumElement+1...,NumElement+NumElement/2-1})
// = concat(low_half(VEC1),low_half(VEC2))
static void createConcatShuffleMask(int NumElements,
                                    SmallVectorImpl<uint32_t> &Mask, bool Low) {
  int NumHalfElements = NumElements / 2;
  int Offset = Low ? 0 : NumHalfElements;
  for (int i = 0; i < NumHalfElements; ++i)
    Mask.push_back(i + Offset);
  for (int i = 0; i < NumHalfElements; ++i)
    Mask.push_back(i + Offset + NumElements);
}
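// Worked example for createConcatShuffleMask (illustrative), NumElements = 8:
//   Low = true  -> Mask = {0,1,2,3,8,9,10,11}   = concat(low halves)
//   Low = false -> Mask = {4,5,6,7,12,13,14,15} = concat(high halves)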

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}
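// For example (illustrative): v32i8 -> v16i16 and v16i8 -> v8i16; the total
// bit width of the type is preserved.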

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned numberOfElement) {

  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, numberOfElement);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> MaskHighTemp1;
  SmallVector<uint32_t, 32> MaskLowTemp1;
  SmallVector<uint32_t, 32> MaskHighWord;
  SmallVector<uint32_t, 32> MaskLowWord;
  SmallVector<uint32_t, 32> ConcatLow;
  SmallVector<uint32_t, 32> ConcatHigh;

  // MaskHigh and MaskLow are built to match the vpunpckhbw and vpunpcklbw X86
  // shuffle patterns.

  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);

  // MaskHighTemp1 and MaskLowTemp1 are built to match the vpunpckhdw and
  // vpunpckldw X86 shuffle patterns.

  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31

  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec1High =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  Value *IntrVec2High =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *High =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
  Value *High1 =
      Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskHighWord);
  Value *Low =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  Value *Low1 =
      Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord);

  if (VT == MVT::v16i8) {
    TransposedMatrix[0] = Low;
    TransposedMatrix[1] = High;
    TransposedMatrix[2] = Low1;
    TransposedMatrix[3] = High1;
    return;
  }

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  // ConcatLow and ConcatHigh are built to match the vperm2i128 and
  // vinserti128 X86 shuffle patterns.
  SmallVector<uint32_t, 32> ConcatHigh12, ConcatHigh13;
  createConcatShuffleMask(numberOfElement, ConcatLow, true);
  createConcatShuffleMask(numberOfElement, ConcatHigh, false);

  TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, ConcatLow);
  TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, ConcatLow);
  TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, ConcatHigh);
  TransposedMatrix[3] = Builder.CreateShuffleVector(Low1, High1, ConcatHigh);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane), ...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF = 16 with register size 256
// (lanes = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<uint32_t> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}
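// Worked example for createShuffleStride (illustrative): VT = v16i8 (one
// 128-bit lane, LaneSize = 16), Stride = 3:
//   Mask = {(i*3)%16 for i = 0..15}
//        = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}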

// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
// inside a shuffle mask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
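// Worked example for setGroupSize (illustrative): for VF = 16 (one lane) the
// loop produces SizeInfo = {6,5,5}: ceil(16/3) = 6, then ceil(14/3) = 5,
// then ceil(15/3) = 5.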

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr works on a per-lane basis.
// Where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm variable sets the offset amount. The result of the function is
// stored inside the ShuffleMask vector and is built as described above.
// AlignDirection is a boolean that indicates the direction of the alignment
// (false - align to the "right" side while true - align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // if i+offset is out of this lane then we actually need the other source
      // If Unary the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
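// Worked example for DecodePALIGNRMask (illustrative): VT = v16i8 (one lane),
// Imm = 2, AlignDirection = true, Unary = false:
//   Offset = 2, so ShuffleMask = {2,3,...,15,16,17}, where elements 16 and 17
//   select from the second source vector.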

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {

  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> Concat;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[3], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  for (unsigned i = 0; i < VecElems && VecElems == 32; ++i)
    Concat.push_back(i);

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  for (int i = 0; i < 3; i++)
    Vec[i] = VecElems == 32
                 ? Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat)
                 : InVec[i];

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];

  return;
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
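// Worked trace for group2Shuffle (illustrative): for VF = 16 (one lane) with
// Mask = GroupSize = {6,5,5}, the first loop yields IndexGroup = {0,11,6}, and
// the second loop cycles through the incremented group bases to produce
// {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}, matching the example above.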

// genShuffleBland - Creates a shuffle mask over two vectors. This function
// only works on instructions with lanes inside 256-bit registers. According
// to the mask 'Mask' it creates a new mask 'Out' by adding an offset to each
// element. The offset amount depends on the two integers 'LowOffset' and
// 'HighOffset', where 'LowOffset' applies to elements taken from the first
// vector and 'HighOffset' to elements taken from the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load,
// we must consider the element order as above.
// In this function we are combining two types of shuffles:
// a per-lane shuffle and a "blend"-style shuffle between the two sources.
// By computing the shuffle on a sequence of 16 elements (one lane) and
// adding the correct offset, we create a per-lane shuffle + blend sequence
// between two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() == 256 &&
         "This function works on only width of 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}
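// For example (illustrative): with VT = v32i8 (NumOfElm = 32), a 16-element
// per-lane Mask, LowOffset = 0 and HighOffset = 16, Out[i] = Mask[i] for the
// first half and Out[16+i] = Mask[i] + 48 for the second half, i.e. a blend
// of the first vector's low lane with the second vector's high lane.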

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {

  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 32> OptimizeShuf[3];
  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  group2Shuffle(VT, GroupSize, VPShuf);

  if (VT.getSizeInBits() <= 128) {
    for (int i = 0; i < 3; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  unsigned NumOfElm = VT.getVectorNumElements();
  genShuffleBland(VT, VPShuf, OptimizeShuf[0], 0, 0);
  genShuffleBland(VT, VPShuf, OptimizeShuf[1], 0, NumOfElm / 2);
  genShuffleBland(VT, VPShuf, OptimizeShuf[2], NumOfElm / 2, NumOfElm / 2);

  for (int i = 0; i < 3; i++)
    TransposedMatrix[i] = Builder.CreateShuffleVector(
        Vec[(i * 2) % 3], Vec[(i * 2 + 1) % 3], OptimizeShuf[i]);

  return;
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating optimized target-specific instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}

// Lower interleaved load(s) into target specific instructions/
// intrinsics. Lowering sequence varies depending on the vector-types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for the combinations listed in
// X86InterleavedAccessGroup::isSupported() above.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}