Blame - llvm/lib/Target/X86/X86InterleavedAccess.cpp - toolchain/llvm-project

blob: f680cfaf9955fd111a81ddd05131246fab0a8ebc [file] [log] [blame]

David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	1	//===--------- X86InterleavedAccess.cpp ----------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	8	//===--------------------------------------------------------------------===//
				9	///
				10	/// \file
				11	/// This file contains the X86 implementation of the interleaved accesses
				12	/// optimization generating X86-specific instructions/intrinsics for
				13	/// interleaved access groups.
				14	///
				15	//===--------------------------------------------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	16
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	17	#include "X86TargetMachine.h"
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	18	#include "llvm/Analysis/VectorUtils.h"
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	19
				20	using namespace llvm;
				21
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	22	namespace {
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	23	/// \brief This class holds necessary information to represent an interleaved
				24	/// access group and supports utilities to lower the group into
				25	/// X86-specific instructions/intrinsics.
				26	/// E.g. A group of interleaving access loads (Factor = 2; accessing every
				27	/// other element)
				28	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
				29	/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
				30	/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	31	class X86InterleavedAccessGroup {
				32	/// \brief Reference to the wide-load instruction of an interleaved access
				33	/// group.
				34	Instruction *const Inst;
				35
				36	/// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
				37	ArrayRef<ShuffleVectorInst *> Shuffles;
				38
				39	/// \brief Reference to the starting index of each user-shuffle.
				40	ArrayRef<unsigned> Indices;
				41
				42	/// \brief Reference to the interleaving stride in terms of elements.
				43	const unsigned Factor;
				44
				45	/// \brief Reference to the underlying target.
				46	const X86Subtarget &Subtarget;
				47
				48	const DataLayout &DL;
				49
				50	IRBuilder<> &Builder;
				51
				52	/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	53	/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
				54	void decompose(Instruction Inst, unsigned NumSubVectors, VectorType T,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	55	SmallVectorImpl<Instruction *> &DecomposedVectors);
				56
				57	/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
				58	/// returns the transposed-vectors in \p TransposedVectors.
				59	/// E.g.
				60	/// InputVectors:
				61	/// In-V0 = p1, p2, p3, p4
				62	/// In-V1 = q1, q2, q3, q4
				63	/// In-V2 = r1, r2, r3, r4
				64	/// In-V3 = s1, s2, s3, s4
				65	/// OutputVectors:
				66	/// Out-V0 = p1, q1, r1, s1
				67	/// Out-V1 = p2, q2, r2, s2
				68	/// Out-V2 = p3, q3, r3, s3
				69	/// Out-V3 = P4, q4, r4, s4
				70	void transpose_4x4(ArrayRef<Instruction *> InputVectors,
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	71	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	72	void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
				73	SmallVectorImpl<Value *> &TransposedMatrix,
				74	unsigned NumSubVecElems);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	75	void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				76	SmallVectorImpl<Value *> &TransposedMatrix,
				77	unsigned NumSubVecElems);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	78
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	79	public:
				80	/// In order to form an interleaved access group X86InterleavedAccessGroup
				81	/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
				82	/// \p Shuffs, reference to the first indices of each interleaved-vector
				83	/// \p 'Ind' and the interleaving stride factor \p F. In order to generate
				84	/// X86-specific instructions/intrinsics it also requires the underlying
				85	/// target information \p STarget.
				86	explicit X86InterleavedAccessGroup(Instruction *I,
				87	ArrayRef<ShuffleVectorInst *> Shuffs,
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	88	ArrayRef<unsigned> Ind, const unsigned F,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	89	const X86Subtarget &STarget,
				90	IRBuilder<> &B)
				91	: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
				92	DL(Inst->getModule()->getDataLayout()), Builder(B) {}
				93
				94	/// \brief Returns true if this interleaved access group can be lowered into
				95	/// x86-specific instructions/intrinsics, false otherwise.
				96	bool isSupported() const;
				97
				98	/// \brief Lowers this interleaved access group into X86-specific
				99	/// instructions/intrinsics.
				100	bool lowerIntoOptimizedSequence();
				101	};
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	102	} // end anonymous namespace
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	103
				104	bool X86InterleavedAccessGroup::isSupported() const {
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	105	VectorType *ShuffleVecTy = Shuffles[0]->getType();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	106	Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	107	unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	108	unsigned WideInstSize;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	109
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	110	// Currently, lowering is supported for the following vectors:
				111	// Stride 4:
				112	// 1. Store and load of 4-element vectors of 64 bits on AVX.
				113	// 2. Store of 16/32-element vectors of 8 bits on AVX.
				114	// Stride 3:
				115	// 1. Load of 16/32-element vecotrs of 8 bits on AVX.
				116	if (!Subtarget.hasAVX() \|\| (Factor != 4 && Factor != 3))
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	117	return false;
				118
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	119	if (isa<LoadInst>(Inst)) {
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	120	WideInstSize = DL.getTypeSizeInBits(Inst->getType());
				121	} else
				122	WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
				123
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	124	// We support shuffle represents stride 4 for byte type with size of
				125	// WideInstSize.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	126	if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
				127	return true;
				128
				129	if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	130	(WideInstSize == 512 \|\| WideInstSize == 1024))
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	131	return true;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	132
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	133	if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
				134	(WideInstSize == 384 \|\| WideInstSize == 768))
				135	return true;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	136
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	137	return false;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	138	}
				139
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	140	void X86InterleavedAccessGroup::decompose(
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	141	Instruction VecInst, unsigned NumSubVectors, VectorType SubVecTy,
				142	SmallVectorImpl<Instruction *> &DecomposedVectors) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	143
				144	assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&
				145	"Expected Load or Shuffle");
				146
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	147	Type *VecTy = VecInst->getType();
Benjamin Kramer	215b22e	2016-12-01 20:49:34 +0000	[diff] [blame]	148	(void)VecTy;
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	149	assert(VecTy->isVectorTy() &&
				150	DL.getTypeSizeInBits(VecTy) >=
				151	DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
				152	"Invalid Inst-size!!!");
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	153
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	154	if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
				155	Value *Op0 = SVI->getOperand(0);
				156	Value *Op1 = SVI->getOperand(1);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	157
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	158	// Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
				159	for (unsigned i = 0; i < NumSubVectors; ++i)
				160	DecomposedVectors.push_back(
				161	cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	162	Op0, Op1,
				163	createSequentialMask(Builder, Indices[i],
				164	SubVecTy->getVectorNumElements(), 0))));
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	165	return;
				166	}
				167
				168	// Decompose the load instruction.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	169	LoadInst *LI = cast<LoadInst>(VecInst);
				170	Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	171	Value *VecBasePtr;
				172	unsigned int NumLoads = NumSubVectors;
				173	// In the case of stride 3 with a vector of 32 elements load the information
				174	// in the following way:
				175	// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
				176	if (DL.getTypeSizeInBits(VecTy) == 768) {
				177	Type *VecTran =
				178	VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
				179	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
				180	NumLoads = NumSubVectors * 2;
				181	} else
				182	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	183	// Generate N loads of T type.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	184	for (unsigned i = 0; i < NumLoads; i++) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	185	// TODO: Support inbounds GEP.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	186	Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
				187	Instruction *NewLoad =
				188	Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
				189	DecomposedVectors.push_back(NewLoad);
				190	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	191	}
				192
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	193	// Create shuffle mask for concatenation of two half vectors.
				194	// Low = false: mask generated for the shuffle
				195	// shuffle(VEC1,VEC2,{NumElement/2, NumElement/2+1, NumElement/2+2...,
				196	// NumElement-1, NumElement+NumElement/2,
				197	// NumElement+NumElement/2+1..., 2*NumElement-1})
				198	// = concat(high_half(VEC1),high_half(VEC2))
				199	// Low = true: mask generated for the shuffle
				200	// shuffle(VEC1,VEC2,{0,1,2,...,NumElement/2-1,NumElement,
				201	// NumElement+1...,NumElement+NumElement/2-1})
				202	// = concat(low_half(VEC1),low_half(VEC2))
				203	static void createConcatShuffleMask(int NumElements,
				204	SmallVectorImpl<uint32_t> &Mask, bool Low) {
				205	int NumHalfElements = NumElements / 2;
				206	int Offset = Low ? 0 : NumHalfElements;
				207	for (int i = 0; i < NumHalfElements; ++i)
				208	Mask.push_back(i + Offset);
				209	for (int i = 0; i < NumHalfElements; ++i)
				210	Mask.push_back(i + Offset + NumElements);
				211	}
				212
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	213	void X86InterleavedAccessGroup::interleave8bitStride4(
				214	ArrayRef<Instruction > Matrix, SmallVectorImpl<Value > &TransposedMatrix,
				215	unsigned numberOfElement) {
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	216
				217	// Example: Assuming we start from the following vectors:
				218	// Matrix[0]= c0 c1 c2 c3 c4 ... c31
				219	// Matrix[1]= m0 m1 m2 m3 m4 ... m31
				220	// Matrix[2]= y0 y1 y2 y3 y4 ... y31
				221	// Matrix[3]= k0 k1 k2 k3 k4 ... k31
				222
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	223	Type *VecTyepVt = VectorType::get(Type::getInt8Ty(Shuffles[0]->getContext()),
				224	numberOfElement);
				225	Type *VecTyepVtHalf = VectorType::get(
				226	Type::getInt16Ty(Shuffles[0]->getContext()), numberOfElement / 2);
				227	MVT VT = MVT::getVT(VecTyepVt);
				228	MVT HalfVT = MVT::getVT(VecTyepVtHalf);
				229
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	230	TransposedMatrix.resize(4);
				231
				232	SmallVector<uint32_t, 32> MaskHighTemp;
				233	SmallVector<uint32_t, 32> MaskLowTemp;
				234	SmallVector<uint32_t, 32> MaskHighTemp1;
				235	SmallVector<uint32_t, 32> MaskLowTemp1;
				236	SmallVector<uint32_t, 32> MaskHighTemp2;
				237	SmallVector<uint32_t, 32> MaskLowTemp2;
				238	SmallVector<uint32_t, 32> ConcatLow;
				239	SmallVector<uint32_t, 32> ConcatHigh;
				240
				241	// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
				242	// shuffle pattern.
				243
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	244	createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp, false, false);
				245	createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp, true, false);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	246	ArrayRef<uint32_t> MaskHigh = makeArrayRef(MaskHighTemp);
				247	ArrayRef<uint32_t> MaskLow = makeArrayRef(MaskLowTemp);
				248
				249	// ConcatHigh and ConcatLow built in the vperm2i128 and vinserti128 X86
				250	// shuffle pattern.
				251
				252	createConcatShuffleMask(32, ConcatLow, true);
				253	createConcatShuffleMask(32, ConcatHigh, false);
				254	ArrayRef<uint32_t> MaskConcatLow = makeArrayRef(ConcatLow);
				255	ArrayRef<uint32_t> MaskConcatHigh = makeArrayRef(ConcatHigh);
				256
				257	// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
				258	// shuffle pattern.
				259
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	260	createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp1, true, false);
				261	createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp1, false, false);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	262	scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskHighTemp1), MaskHighTemp2);
				263	scaleShuffleMask<uint32_t>(2, makeArrayRef(MaskLowTemp1), MaskLowTemp2);
				264	ArrayRef<uint32_t> MaskHighWord = makeArrayRef(MaskHighTemp2);
				265	ArrayRef<uint32_t> MaskLowWord = makeArrayRef(MaskLowTemp2);
				266
				267	// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 \| c16 m16 c17 m17 ... c23 m23
				268	// IntrVec1High = c8 m8 c9 m9 ... c15 m15 \| c24 m24 c25 m25 ... c31 m31
				269	// IntrVec2Low = y0 k0 y1 k1 ... y7 k7 \| y16 k16 y17 k17 ... y23 k23
				270	// IntrVec2High = y8 k8 y9 k9 ... y15 k15 \| y24 k24 y25 k25 ... y31 k31
				271
				272	Value *IntrVec1Low =
				273	Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				274	Value *IntrVec1High =
				275	Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
				276	Value *IntrVec2Low =
				277	Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				278	Value *IntrVec2High =
				279	Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
				280
				281	// cmyk4 cmyk5 cmyk6 cmyk7 \| cmyk20 cmyk21 cmyk22 cmyk23
				282	// cmyk12 cmyk13 cmyk14 cmyk15 \| cmyk28 cmyk29 cmyk30 cmyk31
				283	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk16 cmyk17 cmyk18 cmyk19
				284	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk24 cmyk25 cmyk26 cmyk27
				285
				286	Value *High =
				287	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
				288	Value *High1 =
				289	Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskHighWord);
				290	Value *Low =
				291	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
				292	Value *Low1 =
				293	Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord);
				294
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	295	if (VT == MVT::v16i8) {
				296	TransposedMatrix[0] = Low;
				297	TransposedMatrix[1] = High;
				298	TransposedMatrix[2] = Low1;
				299	TransposedMatrix[3] = High1;
				300	return;
				301	}
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	302	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk4 cmyk5 cmyk6 cmyk7
				303	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk12 cmyk13 cmyk14 cmyk15
				304	// cmyk16 cmyk17 cmyk18 cmyk19 \| cmyk20 cmyk21 cmyk22 cmyk23
				305	// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31
				306
				307	TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, MaskConcatLow);
				308	TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, MaskConcatLow);
				309	TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, MaskConcatHigh);
				310	TransposedMatrix[3] =
				311	Builder.CreateShuffleVector(Low1, High1, MaskConcatHigh);
				312	}
				313
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	314	// createShuffleStride returns shuffle mask of size N.
				315	// The shuffle pattern is as following :
				316	// {0, Stride%(VF/Lane), (2Stride%(VF/Lane))...(VFStride/Lane)%(VF/Lane),
				317	// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
				318	// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
				319	// Where Lane is the # of lanes in a register:
				320	// VectorSize = 128 => Lane = 1
				321	// VectorSize = 256 => Lane = 2
				322	// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
				323	// {<[0\|3\|6\|1\|4\|7\|2\|5]-[8\|11\|14\|9\|12\|15\|10\|13]>}
				324	static void createShuffleStride(MVT VT, int Stride,
				325	SmallVectorImpl<uint32_t> &Mask) {
				326	int VectorSize = VT.getSizeInBits();
				327	int VF = VT.getVectorNumElements();
				328	int LaneCount = std::max(VectorSize / 128, 1);
				329	for (int Lane = 0; Lane < LaneCount; Lane++)
				330	for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
				331	Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
				332	}
				333
				334	// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
				335	// inside mask a shuffleMask. A mask contains exactly 3 groups, where
				336	// each group is a monotonically increasing sequence with stride 3.
				337	// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
				338	static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
				339	int VectorSize = VT.getSizeInBits();
				340	int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
				341	for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
				342	int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
				343	SizeInfo.push_back(GroupSize);
				344	FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
				345	}
				346	}
				347
				348	// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
				349	// vpalign works according to lanes
				350	// Where Lane is the # of lanes in a register:
				351	// VectorWide = 128 => Lane = 1
				352	// VectorWide = 256 => Lane = 2
				353	// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
				354	// For Lane = 2 shuffle pattern is:
				355	// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
				356	// Imm variable sets the offset amount. The result of the
				357	// function is stored inside ShuffleMask vector and it built as described in
				358	// the begin of the description. AlignDirection is a boolean that indecat the
				359	// direction of the alignment. (false - align to the "right" side while true -
				360	// align to the "left" side)
				361	static void DecodePALIGNRMask(MVT VT, unsigned Imm,
				362	SmallVectorImpl<uint32_t> &ShuffleMask,
				363	bool AlignDirection = true, bool Unary = false) {
				364
				365	unsigned NumElts = VT.getVectorNumElements();
				366	unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
				367	unsigned NumLaneElts = NumElts / NumLanes;
				368
				369	Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
				370	unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
				371
				372	for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
				373	for (unsigned i = 0; i != NumLaneElts; ++i) {
				374	unsigned Base = i + Offset;
				375	// if i+offset is out of this lane then we actually need the other source
				376	// If Unary the other source is the first source.
				377	if (Base >= NumLaneElts)
				378	Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
				379	ShuffleMask.push_back(Base + l);
				380	}
				381	}
				382	}
				383
				384	void X86InterleavedAccessGroup::deinterleave8bitStride3(
				385	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				386	unsigned VecElems) {
				387
				388	// Example: Assuming we start from the following vectors:
				389	// Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
				390	// Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
				391	// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
				392
				393	TransposedMatrix.resize(3);
				394	SmallVector<uint32_t, 32> Concat;
				395	SmallVector<uint32_t, 32> VPShuf;
				396	SmallVector<uint32_t, 32> VPAlign[2];
				397	SmallVector<uint32_t, 32> VPAlign2;
				398	SmallVector<uint32_t, 32> VPAlign3;
				399	SmallVector<uint32_t, 3> GroupSize;
				400	Value Vec[3], TempVector[3];
				401
				402	MVT VT = MVT::getVT(Shuffles[0]->getType());
				403
				404	for (unsigned i = 0; i < VecElems && VecElems == 32; ++i)
				405	Concat.push_back(i);
				406
				407	createShuffleStride(VT, 3, VPShuf);
				408	setGroupSize(VT, GroupSize);
				409
				410	for (int i = 0; i < 2; i++)
				411	DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
				412
				413	DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
				414	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
				415
				416	for (int i = 0; i < 3; i++)
				417	Vec[i] = VecElems == 32
				418	? Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat)
				419	: InVec[i];
				420
				421	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				422	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				423	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				424
				425	for (int i = 0; i < 3; i++)
				426	Vec[i] = Builder.CreateShuffleVector(
				427	Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
				428
				429	// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
				430	// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
				431	// TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
				432
				433	for (int i = 0; i < 3; i++)
				434	TempVector[i] =
				435	Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
				436
				437	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				438	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				439	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				440
				441	for (int i = 0; i < 3; i++)
				442	Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
				443	VPAlign[1]);
				444
				445	// TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				446	// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				447	// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
				448
				449	Value *TempVec = Builder.CreateShuffleVector(
				450	Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
				451	TransposedMatrix[0] = Builder.CreateShuffleVector(
				452	Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
				453	TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
				454	TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
				455
				456	return;
				457	}
				458
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	459	void X86InterleavedAccessGroup::transpose_4x4(
				460	ArrayRef<Instruction *> Matrix,
				461	SmallVectorImpl<Value *> &TransposedMatrix) {
				462	assert(Matrix.size() == 4 && "Invalid matrix size");
				463	TransposedMatrix.resize(4);
				464
				465	// dst = src1[0,1],src2[0,1]
				466	uint32_t IntMask1[] = {0, 1, 4, 5};
				467	ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
				468	Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				469	Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				470
				471	// dst = src1[2,3],src2[2,3]
				472	uint32_t IntMask2[] = {2, 3, 6, 7};
				473	Mask = makeArrayRef(IntMask2, 4);
				474	Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				475	Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				476
				477	// dst = src1[0],src2[0],src1[2],src2[2]
				478	uint32_t IntMask3[] = {0, 4, 2, 6};
				479	Mask = makeArrayRef(IntMask3, 4);
				480	TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				481	TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				482
				483	// dst = src1[1],src2[1],src1[3],src2[3]
				484	uint32_t IntMask4[] = {1, 5, 3, 7};
				485	Mask = makeArrayRef(IntMask4, 4);
				486	TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				487	TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				488	}
				489
				490	// Lowers this interleaved access group into X86-specific
				491	// instructions/intrinsics.
				492	bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
				493	SmallVector<Instruction *, 4> DecomposedVectors;
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	494	SmallVector<Value *, 4> TransposedVectors;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	495	VectorType *ShuffleTy = Shuffles[0]->getType();
				496
				497	if (isa<LoadInst>(Inst)) {
				498	// Try to generate target-sized register(/instruction).
				499	decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
				500
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	501	Type *ShuffleEltTy = Inst->getType();
				502	unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	503	// Perform matrix-transposition in order to compute interleaved
				504	// results by generating some sort of (optimized) target-specific
				505	// instructions.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame^]	506
				507	switch (NumSubVecElems) {
				508	default:
				509	return false;
				510	case 4:
				511	transpose_4x4(DecomposedVectors, TransposedVectors);
				512	break;
				513	case 8:
				514	case 16:
				515	case 32:
				516	deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
				517	NumSubVecElems);
				518	break;
				519	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	520
				521	// Now replace the unoptimized-interleaved-vectors with the
				522	// transposed-interleaved vectors.
				523	for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
				524	Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
				525
				526	return true;
				527	}
				528
				529	Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
				530	unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
				531
				532	// Lower the interleaved stores:
				533	// 1. Decompose the interleaved wide shuffle into individual shuffle
				534	// vectors.
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	535	decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
				536	DecomposedVectors);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	537
				538	// 2. Transpose the interleaved-vectors into vectors of contiguous
				539	// elements.
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	540	switch (NumSubVecElems) {
				541	case 4:
				542	transpose_4x4(DecomposedVectors, TransposedVectors);
				543	break;
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	544	case 16:
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	545	case 32:
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	546	interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	547	break;
				548	default:
				549	return false;
				550	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	551
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	552	// 3. Concatenate the contiguous-vectors back into a wide vector.
				553	Value *WideVec = concatenateVectors(Builder, TransposedVectors);
				554
				555	// 4. Generate a store instruction for wide-vec.
				556	StoreInst *SI = cast<StoreInst>(Inst);
				557	Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
				558	SI->getAlignment());
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	559
				560	return true;
				561	}
				562
				563	// Lower interleaved load(s) into target specific instructions/
				564	// intrinsics. Lowering sequence varies depending on the vector-types, factor,
				565	// number of shuffles and ISA.
				566	// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	567	bool X86TargetLowering::lowerInterleavedLoad(
				568	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
				569	ArrayRef<unsigned> Indices, unsigned Factor) const {
				570	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				571	"Invalid interleave factor");
				572	assert(!Shuffles.empty() && "Empty shufflevector input");
				573	assert(Shuffles.size() == Indices.size() &&
				574	"Unmatched number of shufflevectors and indices");
				575
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	576	// Create an interleaved access group.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	577	IRBuilder<> Builder(LI);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	578	X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
				579	Builder);
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	580
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	581	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	582	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	583
				584	bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
				585	ShuffleVectorInst *SVI,
				586	unsigned Factor) const {
				587	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				588	"Invalid interleave factor");
				589
Farhana Aleen	9bd593e	2017-06-22 23:56:31 +0000	[diff] [blame]	590	assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	591	"Invalid interleaved store");
				592
				593	// Holds the indices of SVI that correspond to the starting index of each
				594	// interleaved shuffle.
				595	SmallVector<unsigned, 4> Indices;
				596	auto Mask = SVI->getShuffleMask();
				597	for (unsigned i = 0; i < Factor; i++)
				598	Indices.push_back(Mask[i]);
				599
				600	ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
				601
				602	// Create an interleaved access group.
				603	IRBuilder<> Builder(SI);
				604	X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
				605	Builder);
				606
				607	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
				608	}