//===--------- X86InterleavedAccess.cpp ----------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===--------------------------------------------------------------------===//
///
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
///
//===--------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

namespace {
/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element)
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
///   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// \brief Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// \brief Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// \brief The interleaving stride in terms of elements.
  const unsigned Factor;

  /// \brief Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// \brief Breaks down a vector \p Inst of N elements into \p NumSubVectors
  /// sub-vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedVectors);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p I, a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p Ind, and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// \brief Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// \brief Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned SupportedNumElem = 4;
  unsigned WideInstSize;

  // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
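  // Concretely: each user-shuffle must produce a 4 x 64-bit vector (one
  // 256-bit YMM register), and the wide instruction must span exactly
  // Factor such sub-vectors, e.g. a <16 x i64> load for Factor = 4.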
  if (isa<LoadInst>(Inst)) {
    if (DL.getTypeSizeInBits(ShuffleVecTy) !=
        SupportedNumElem * ShuffleElemSize)
      return false;

    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  if (!Subtarget.hasAVX() || Factor != 4 || ShuffleElemSize != 64 ||
      WideInstSize != (Factor * ShuffleElemSize * SupportedNumElem))
    return false;

  return true;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecTy = VecInst->getType();
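  // VecTy is referenced only by the assert below; the (void) cast keeps
  // release builds, where asserts compile away, free of an unused-variable
  // warning.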
  (void)VecTy;
  assert(VecTy->isVectorTy() &&
         DL.getTypeSizeInBits(VecTy) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
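  // Bitcast the wide-load pointer to a pointer-to-SubVecTy, so that each GEP
  // below advances the base pointer by one full sub-vector.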
  Value *VecBasePtr =
      Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);

  // Generate N loads of T type.
  for (unsigned i = 0; i < NumSubVectors; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);
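
  // The transpose is built from two rounds of shuffles: the first round
  // gathers matching 128-bit halves of row pairs, the second interleaves
  // their 64-bit elements. On AVX these shuffles lower naturally to
  // VPERM2F128- and VUNPCK-style operations.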

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.
    transpose_4x4(DecomposedVectors, TransposedVectors);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
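    // Shuffles[i] originally extracted the lane starting at element
    // Indices[i]; after the transpose, that lane is exactly row
    // TransposedVectors[Indices[i]].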
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  transpose_4x4(DecomposedVectors, TransposedVectors);

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}

// Lower interleaved load(s) into target-specific instructions/intrinsics.
// The lowering sequence varies depending on the vector types, factor, number
// of shuffles and ISA.
// Currently, lowering is supported for 4x64-bit vectors with Factor = 4 on
// AVX.
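// E.g. with Factor = 4 this matches IR of the form
//   %wide.vec = load <16 x i64>, <16 x i64>* %ptr
//   %v0 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <0, 4, 8, 12>
//   %v1 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <1, 5, 9, 13>
//   %v2 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <2, 6, 10, 14>
//   %v3 = shuffle <16 x i64> %wide.vec, <16 x i64> undef, <3, 7, 11, 15>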
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

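  // E.g. with Factor = 4 this matches IR of the form
  //   %i.vec = shuffle <8 x i64> %v0, <8 x i64> %v1,
  //            <0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>
  //   store <16 x i64> %i.vec, <16 x i64>* %ptr
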
  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}