blob: cdb24b9d40a67c74c39be8bfad2f96def238cbe7 [file] [log] [blame]
Eugene Zelenko60433b62017-10-05 00:33:50 +00001//===- X86InterleavedAccess.cpp -------------------------------------------===//
David L Kreitzer01a057a2016-10-14 18:20:41 +00002//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
Eugene Zelenko60433b62017-10-05 00:33:50 +00008//===----------------------------------------------------------------------===//
9//
David L Kreitzer0e3ae302016-12-01 19:56:39 +000010/// \file
11/// This file contains the X86 implementation of the interleaved accesses
12/// optimization generating X86-specific instructions/intrinsics for
13/// interleaved access groups.
Eugene Zelenko60433b62017-10-05 00:33:50 +000014//
15//===----------------------------------------------------------------------===//
David L Kreitzer01a057a2016-10-14 18:20:41 +000016
Eugene Zelenko60433b62017-10-05 00:33:50 +000017#include "X86ISelLowering.h"
18#include "X86Subtarget.h"
19#include "llvm/ADT/ArrayRef.h"
20#include "llvm/ADT/SmallVector.h"
Farhana Aleen4b652a52017-06-22 22:59:04 +000021#include "llvm/Analysis/VectorUtils.h"
Eugene Zelenko60433b62017-10-05 00:33:50 +000022#include "llvm/CodeGen/MachineValueType.h"
23#include "llvm/IR/Constants.h"
24#include "llvm/IR/DataLayout.h"
25#include "llvm/IR/DerivedTypes.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/Instruction.h"
28#include "llvm/IR/Instructions.h"
29#include "llvm/IR/Module.h"
30#include "llvm/IR/Type.h"
31#include "llvm/IR/Value.h"
32#include "llvm/Support/Casting.h"
33#include <algorithm>
34#include <cassert>
35#include <cmath>
36#include <cstdint>
David L Kreitzer01a057a2016-10-14 18:20:41 +000037
38using namespace llvm;
39
namespace {

/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// \brief Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// \brief Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// \brief Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// \brief Reference to the underlying target.
  const X86Subtarget &Subtarget;

  /// \brief Data layout of the containing module; used for type-size queries.
  const DataLayout &DL;

  /// \brief Builder through which the replacement IR sequence is emitted.
  IRBuilder<> &Builder;

  /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = P4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);

  /// \brief Interleaves \p InputVectors (groups of \p NumSubVecElems i8
  /// elements) for a stride-4 store; result goes to \p TransposedMatrix.
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);

  /// \brief Variant of interleave8bitStride4 specialized for 8-element
  /// sub-vectors (VF = 8).
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);

  /// \brief Interleaves \p InputVectors for a stride-3 store of i8 elements.
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);

  /// \brief De-interleaves \p InputVectors for a stride-3 load of i8 elements.
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// \brief Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// \brief Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000128
129bool X86InterleavedAccessGroup::isSupported() const {
David L Kreitzer01a057a2016-10-14 18:20:41 +0000130 VectorType *ShuffleVecTy = Shuffles[0]->getType();
David L Kreitzer01a057a2016-10-14 18:20:41 +0000131 Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
Farhana Aleene4a89a62017-07-21 21:35:00 +0000132 unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
Farhana Aleene4a89a62017-07-21 21:35:00 +0000133 unsigned WideInstSize;
David L Kreitzer01a057a2016-10-14 18:20:41 +0000134
Michael Zuckerman5a385942017-09-07 14:02:13 +0000135 // Currently, lowering is supported for the following vectors:
136 // Stride 4:
137 // 1. Store and load of 4-element vectors of 64 bits on AVX.
138 // 2. Store of 16/32-element vectors of 8 bits on AVX.
139 // Stride 3:
Craig Topper21531142017-11-14 16:14:00 +0000140 // 1. Load of 16/32-element vectors of 8 bits on AVX.
Michael Zuckerman5a385942017-09-07 14:02:13 +0000141 if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
Michael Zuckerman680ac102017-08-07 13:22:39 +0000142 return false;
143
Farhana Aleene4a89a62017-07-21 21:35:00 +0000144 if (isa<LoadInst>(Inst)) {
Farhana Aleene4a89a62017-07-21 21:35:00 +0000145 WideInstSize = DL.getTypeSizeInBits(Inst->getType());
Michael Zuckerman72a6f892017-10-18 08:04:31 +0000146 if (cast<LoadInst>(Inst)->getPointerAddressSpace())
147 return false;
Farhana Aleene4a89a62017-07-21 21:35:00 +0000148 } else
149 WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
150
Michael Zuckerman680ac102017-08-07 13:22:39 +0000151 // We support shuffle represents stride 4 for byte type with size of
152 // WideInstSize.
Michael Zuckerman5a385942017-09-07 14:02:13 +0000153 if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
154 return true;
155
156 if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000157 (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
158 WideInstSize == 2048))
159 return true;
Michael Zuckermanc1918ad2017-07-26 08:10:14 +0000160
Michael Zuckerman645f7772017-09-26 18:49:11 +0000161 if (ShuffleElemSize == 8 && Factor == 3 &&
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000162 (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
Michael Zuckerman645f7772017-09-26 18:49:11 +0000163 return true;
David L Kreitzer01a057a2016-10-14 18:20:41 +0000164
Michael Zuckerman5a385942017-09-07 14:02:13 +0000165 return false;
David L Kreitzer01a057a2016-10-14 18:20:41 +0000166}
167
Farhana Aleen4b652a52017-06-22 22:59:04 +0000168void X86InterleavedAccessGroup::decompose(
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000169 Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
170 SmallVectorImpl<Instruction *> &DecomposedVectors) {
Farhana Aleen4b652a52017-06-22 22:59:04 +0000171 assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
172 "Expected Load or Shuffle");
173
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000174 Type *VecWidth = VecInst->getType();
175 (void)VecWidth;
176 assert(VecWidth->isVectorTy() &&
177 DL.getTypeSizeInBits(VecWidth) >=
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000178 DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
179 "Invalid Inst-size!!!");
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000180
Farhana Aleen4b652a52017-06-22 22:59:04 +0000181 if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
182 Value *Op0 = SVI->getOperand(0);
183 Value *Op1 = SVI->getOperand(1);
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000184
Farhana Aleen4b652a52017-06-22 22:59:04 +0000185 // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
186 for (unsigned i = 0; i < NumSubVectors; ++i)
187 DecomposedVectors.push_back(
188 cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Farhana Aleene4a89a62017-07-21 21:35:00 +0000189 Op0, Op1,
190 createSequentialMask(Builder, Indices[i],
191 SubVecTy->getVectorNumElements(), 0))));
Farhana Aleen4b652a52017-06-22 22:59:04 +0000192 return;
193 }
194
195 // Decompose the load instruction.
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000196 LoadInst *LI = cast<LoadInst>(VecInst);
197 Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
Michael Zuckerman5a385942017-09-07 14:02:13 +0000198 Value *VecBasePtr;
199 unsigned int NumLoads = NumSubVectors;
200 // In the case of stride 3 with a vector of 32 elements load the information
201 // in the following way:
202 // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000203 unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
204 if (VecLength == 768 || VecLength == 1536) {
Michael Zuckerman5a385942017-09-07 14:02:13 +0000205 Type *VecTran =
206 VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
207 VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000208 NumLoads = NumSubVectors * (VecLength / 384);
Michael Zuckerman5a385942017-09-07 14:02:13 +0000209 } else
210 VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
Farhana Aleen4b652a52017-06-22 22:59:04 +0000211 // Generate N loads of T type.
Michael Zuckerman5a385942017-09-07 14:02:13 +0000212 for (unsigned i = 0; i < NumLoads; i++) {
Farhana Aleen4b652a52017-06-22 22:59:04 +0000213 // TODO: Support inbounds GEP.
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000214 Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
215 Instruction *NewLoad =
216 Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
217 DecomposedVectors.push_back(NewLoad);
218 }
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000219}
220
Michael Zuckerman80d3649f2017-09-13 18:28:09 +0000221// Changing the scale of the vector type by reducing the number of elements and
222// doubling the scalar size.
223static MVT scaleVectorType(MVT VT) {
224 unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
225 return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
226 VT.getVectorNumElements() / 2);
227}
228
// Identity shuffle mask covering indices 0..63. Prefixes of this table
// (via makeArrayRef(Concat, N)) are used by the helpers below to
// concatenate two sub-vectors back to back with a single shufflevector.
static uint32_t Concat[] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
234
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000235// genShuffleBland - Creates shuffle according to two vectors.This function is
236// only works on instructions with lane inside 256 registers. According to
237// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
238// offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
239// Where the 'LowOffset' refers to the first vector and the highOffset refers to
240// the second vector.
241// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
242// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
243// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
244// For the sequence to work as a mirror to the load.
245// We must consider the elements order as above.
246// In this function we are combining two types of shuffles.
247// The first one is vpshufed and the second is a type of "blend" shuffle.
248// By computing the shuffle on a sequence of 16 elements(one lane) and add the
249// correct offset. We are creating a vpsuffed + blend sequence between two
250// shuffles.
251static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
252 SmallVectorImpl<uint32_t> &Out, int LowOffset,
253 int HighOffset) {
254 assert(VT.getSizeInBits() >= 256 &&
255 "This function doesn't accept width smaller then 256");
256 unsigned NumOfElm = VT.getVectorNumElements();
257 for (unsigned i = 0; i < Mask.size(); i++)
258 Out.push_back(Mask[i] + LowOffset);
259 for (unsigned i = 0; i < Mask.size(); i++)
260 Out.push_back(Mask[i] + HighOffset + NumOfElm);
261}
262
Craig Topper21531142017-11-14 16:14:00 +0000263// reorderSubVector returns the data to is the original state. And de-facto is
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000264// the opposite of the function concatSubVector.
265
266// For VecElems = 16
267// Invec[0] - |0| TransposedMatrix[0] - |0|
268// Invec[1] - |1| => TransposedMatrix[1] - |1|
269// Invec[2] - |2| TransposedMatrix[2] - |2|
270
271// For VecElems = 32
272// Invec[0] - |0|3| TransposedMatrix[0] - |0|1|
273// Invec[1] - |1|4| => TransposedMatrix[1] - |2|3|
274// Invec[2] - |2|5| TransposedMatrix[2] - |4|5|
275
276// For VecElems = 64
277// Invec[0] - |0|3|6|9 | TransposedMatrix[0] - |0|1|2 |3 |
278// Invec[1] - |1|4|7|10| => TransposedMatrix[1] - |4|5|6 |7 |
279// Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11|
280
281static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
282 ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
283 unsigned VecElems, unsigned Stride,
284 IRBuilder<> Builder) {
285
286 if (VecElems == 16) {
287 for (unsigned i = 0; i < Stride; i++)
288 TransposedMatrix[i] = Builder.CreateShuffleVector(
289 Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
290 return;
291 }
292
293 SmallVector<uint32_t, 32> OptimizeShuf;
294 Value *Temp[8];
295
296 for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
297 genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
298 (i + 1) / Stride * 16);
299 Temp[i / 2] = Builder.CreateShuffleVector(
300 Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
301 OptimizeShuf.clear();
302 }
303
304 if (VecElems == 32) {
305 std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
306 return;
307 }
308 else
309 for (unsigned i = 0; i < Stride; i++)
310 TransposedMatrix[i] =
311 Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
Michael Zuckermane4084f6b2017-10-02 07:35:25 +0000312}
313
/// Interleave four 8 x i8 vectors into two 16-element results for a stride-4
/// store with VF = 8. The unpack masks are computed on v8i16 and scaled so
/// pairs of bytes move together.
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  // MaskLow = {0, 8, 1, 9, ...}: byte-interleave two 8-element sources.
  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  // Build word-granularity unpack masks, then scale x2 so each word mask
  // entry moves a pair of adjacent byte elements.
  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}
353
/// Interleave four i8 vectors of NumOfElm elements for a stride-4 store,
/// using byte-unpack then scaled dword-unpack shuffles, and (for widths
/// above 128 bits) a final in-lane reorder via reorderSubVector.
void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  // HalfVT: same width with doubled scalar size, for the dword-level step.
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> LowHighMask[2];
  SmallVector<uint32_t, 32> MaskHighTemp;
  SmallVector<uint32_t, 32> MaskLowTemp;

  // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
  // shuffle pattern.

  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);

  // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
  // shuffle pattern.

  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
  // Scale x2 so each half-width mask entry moves a pair of byte elements.
  scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6   cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14  cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2   cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10  cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3   | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11  | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27  | cmyk28 cmyk29 cmyk30 cmyk31

  // 128-bit case: single lane, results are already in final order.
  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  // Wider cases need a cross-lane fix-up of the 128-bit chunks.
  reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
                   NumOfElm, 4, Builder);
}
421
Michael Zuckerman5a385942017-09-07 14:02:13 +0000422// createShuffleStride returns shuffle mask of size N.
423// The shuffle pattern is as following :
424// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
425// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
426// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
427// Where Lane is the # of lanes in a register:
428// VectorSize = 128 => Lane = 1
429// VectorSize = 256 => Lane = 2
430// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
431// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
432static void createShuffleStride(MVT VT, int Stride,
433 SmallVectorImpl<uint32_t> &Mask) {
434 int VectorSize = VT.getSizeInBits();
435 int VF = VT.getVectorNumElements();
436 int LaneCount = std::max(VectorSize / 128, 1);
437 for (int Lane = 0; Lane < LaneCount; Lane++)
438 for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
439 Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
440}
441
442// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
443// inside mask a shuffleMask. A mask contains exactly 3 groups, where
444// each group is a monotonically increasing sequence with stride 3.
445// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
446static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
447 int VectorSize = VT.getSizeInBits();
448 int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
449 for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
450 int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
451 SizeInfo.push_back(GroupSize);
452 FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
453 }
454}
455
// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
// vpalign works according to lanes
// Where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// Imm variable sets the offset amount. The result of the
// function is stored inside ShuffleMask vector and it is built as described
// above. AlignDirection is a boolean that indicates the direction of the
// alignment. (false - align to the "right" side while true - align to the
// "left" side)
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  // Right-alignment is expressed as the complementary left offset.
  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // if i+offset is out of this lane then we actually need the other source
      // If Unary the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
490
// concatSubVector - The function rebuilds the data to a correct expected
// order. An assumption(The shape of the matrix) was taken for the
// deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
// This function ensures that the data is built in correct way for the lane
// instructions. Each lane inside the vector is a 128-bit length.
//
// The 'InVec' argument contains the data in increasing order. In InVec[0] You
// can find the first 128 bit data. The number of different lanes inside a
// vector depends on the 'VecElems'.In general, the formula is
// VecElems * type / 128. The size of the array 'InVec' depends and equal to
// 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1| =>   Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3| =>   Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 | =>   Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> Builder) {
  // Single 128-bit lane per sub-vector: nothing to recombine.
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  // Pair up 128-bit pieces into 256-bit values, strided by 3.
  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  // VecElems == 64: join the two 256-bit halves into 512-bit values.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}
537
/// Lower a stride-3 interleaved load of i8 elements: recover the three
/// component vectors from the wide interleaved data using a strided shuffle
/// followed by two rounds of palignr-style rotations.
void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  // Rotation masks for the two in-lane alignment rounds below.
  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  // Final correction rotations applied to two of the three results.
  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  // Rebuild the loaded pieces into lane-friendly wide values.
  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  // For VF = 8 the b/c results land swapped relative to wider VFs.
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}
601
// group2Shuffle reorder the shuffle stride back into continuous order.
// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    // Record each group's starting position; Mask holds the group sizes
    // (as produced by setGroupSize), so Index accumulates the group starts.
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
623
/// Lower a stride-3 interleaved store of i8 elements: the mirror image of
/// deinterleave8bitStride3 — rotate the inputs, blend them with two rounds
/// of palignr-style shuffles, then restore contiguous order.
void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  // Rotation masks for the two blending rounds below.
  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  // Pre-rotations applied to two of the three inputs.
  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  // Map the grouped layout back to contiguous order, then fix up lanes.
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}
684
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000685void X86InterleavedAccessGroup::transpose_4x4(
686 ArrayRef<Instruction *> Matrix,
687 SmallVectorImpl<Value *> &TransposedMatrix) {
688 assert(Matrix.size() == 4 && "Invalid matrix size");
689 TransposedMatrix.resize(4);
690
691 // dst = src1[0,1],src2[0,1]
692 uint32_t IntMask1[] = {0, 1, 4, 5};
693 ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
694 Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
695 Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
696
697 // dst = src1[2,3],src2[2,3]
698 uint32_t IntMask2[] = {2, 3, 6, 7};
699 Mask = makeArrayRef(IntMask2, 4);
700 Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
701 Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
702
703 // dst = src1[0],src2[0],src1[2],src2[2]
704 uint32_t IntMask3[] = {0, 4, 2, 6};
705 Mask = makeArrayRef(IntMask3, 4);
706 TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
707 TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
708
709 // dst = src1[1],src2[1],src1[3],src2[3]
710 uint32_t IntMask4[] = {1, 5, 3, 7};
711 Mask = makeArrayRef(IntMask4, 4);
712 TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
713 TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
714}
715
// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
// Returns true when an optimized sequence was emitted, false when the
// sub-vector element count has no dedicated lowering here.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // For loads, the wide type comes from the load instruction itself.
    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
      // NOTE(review): this path assumes Factor == 3; presumably guaranteed
      // by isSupported() before we get here — confirm.
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  // Store path: the wide type comes from the shuffle feeding the store.
  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  // 2. Transpose the interleaved-vectors into vectors of contiguous
  //   elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    // NOTE(review): if Factor is neither 3 nor 4 here, TransposedVectors
    // stays empty and the concatenation below would see no vectors;
    // presumably isSupported() rules that combination out — confirm.
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  // 3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  // 4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}
798
799// Lower interleaved load(s) into target specific instructions/
800// intrinsics. Lowering sequence varies depending on the vector-types, factor,
801// number of shuffles and ISA.
802// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
David L Kreitzer01a057a2016-10-14 18:20:41 +0000803bool X86TargetLowering::lowerInterleavedLoad(
804 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
805 ArrayRef<unsigned> Indices, unsigned Factor) const {
806 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
807 "Invalid interleave factor");
808 assert(!Shuffles.empty() && "Empty shufflevector input");
809 assert(Shuffles.size() == Indices.size() &&
810 "Unmatched number of shufflevectors and indices");
811
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000812 // Create an interleaved access group.
David L Kreitzer01a057a2016-10-14 18:20:41 +0000813 IRBuilder<> Builder(LI);
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000814 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
815 Builder);
David L Kreitzer01a057a2016-10-14 18:20:41 +0000816
David L Kreitzer0e3ae302016-12-01 19:56:39 +0000817 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
David L Kreitzer01a057a2016-10-14 18:20:41 +0000818}
Farhana Aleen4b652a52017-06-22 22:59:04 +0000819
820bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
821 ShuffleVectorInst *SVI,
822 unsigned Factor) const {
823 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
824 "Invalid interleave factor");
825
Farhana Aleen9bd593e2017-06-22 23:56:31 +0000826 assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
Farhana Aleen4b652a52017-06-22 22:59:04 +0000827 "Invalid interleaved store");
828
829 // Holds the indices of SVI that correspond to the starting index of each
830 // interleaved shuffle.
831 SmallVector<unsigned, 4> Indices;
832 auto Mask = SVI->getShuffleMask();
833 for (unsigned i = 0; i < Factor; i++)
834 Indices.push_back(Mask[i]);
835
836 ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
837
838 // Create an interleaved access group.
839 IRBuilder<> Builder(SI);
840 X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
841 Builder);
842
843 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
844}