//===--------- X86InterleavedAccess.cpp ----------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===--------------------------------------------------------------------===//
///
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
///
//===--------------------------------------------------------------------===//

#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"

using namespace llvm;

namespace {
/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
///      other element)
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// \brief Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// \brief Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// \brief Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// \brief Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, reference to the first indices of each interleaved-vector
  /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// \brief Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// \brief Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffles that represent stride 4 for 64-bit and 8-bit element
  // types, and stride 3 for 8-bit element types, for the wide-instruction
  // sizes checked below.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768))
    return true;

  return false;
}
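// For example (illustrative, derived from the checks in isSupported above): a
// stride-3 deinterleaving load of <96 x i8> (WideInstSize = 768) is accepted,
// as is a stride-4 store whose wide shuffle type is <32 x i8>
// (WideInstSize = 256).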

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {

  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecTy = VecInst->getType();
  (void)VecTy;
  assert(VecTy->isVectorTy() &&
         DL.getTypeSizeInBits(VecTy) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
  Value *VecBasePtr;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, load the
  // information in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  if (DL.getTypeSizeInBits(VecTy) == 768) {
    Type *VecTran =
        VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
    NumLoads = NumSubVectors * 2;
  } else
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}
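// For example (illustrative): a 768-bit stride-3 wide load (<96 x i8>) with
// NumSubVectors = 3 is split into NumLoads = 6 consecutive <16 x i8> loads,
// while a <16 x i64> wide load decomposed with NumSubVectors = 4 yields 4
// <4 x i64> loads.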

// Create shuffle mask for concatenation of two half vectors.
// Low = false: mask generated for the shuffle
// shuffle(VEC1,VEC2,{NumElement/2, NumElement/2+1, NumElement/2+2...,
//                    NumElement-1, NumElement+NumElement/2,
//                    NumElement+NumElement/2+1..., 2*NumElement-1})
// = concat(high_half(VEC1),high_half(VEC2))
// Low = true: mask generated for the shuffle
// shuffle(VEC1,VEC2,{0,1,2,...,NumElement/2-1,NumElement,
//                    NumElement+1...,NumElement+NumElement/2-1})
// = concat(low_half(VEC1),low_half(VEC2))
static void createConcatShuffleMask(int NumElements,
                                    SmallVectorImpl<uint32_t> &Mask, bool Low) {
  int NumHalfElements = NumElements / 2;
  int Offset = Low ? 0 : NumHalfElements;
  for (int i = 0; i < NumHalfElements; ++i)
    Mask.push_back(i + Offset);
  for (int i = 0; i < NumHalfElements; ++i)
    Mask.push_back(i + Offset + NumElements);
}
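// Worked example for createConcatShuffleMask (illustrative), NumElements = 8:
//   Low = true  -> Mask = {0,1,2,3,8,9,10,11}   = concat(low halves)
//   Low = false -> Mask = {4,5,6,7,12,13,14,15} = concat(high halves)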

// Changing the scale of the vector type by reducing the number of elements and
// doubling the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}
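// For example (illustrative): v32i8 -> v16i16 and v16i8 -> v8i16; the total
// bit width of the type is preserved.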

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<uint32_t, 16> MaskLow;
  SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned numberOfElement) {

  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, numberOfElement);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<uint32_t, 32> MaskHigh;
  SmallVector<uint32_t, 32> MaskLow;
  SmallVector<uint32_t, 32> MaskHighTemp1;
  SmallVector<uint32_t, 32> MaskLowTemp1;
  SmallVector<uint32_t, 32> MaskHighWord;
  SmallVector<uint32_t, 32> MaskLowWord;
  SmallVector<uint32_t, 32> ConcatLow;
  SmallVector<uint32_t, 32> ConcatHigh;

  // MaskHigh and MaskLow are built to match the vpunpckhbw and vpunpcklbw X86
  // shuffle patterns.

  createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
  createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);

  // MaskHighTemp1 and MaskLowTemp1 are built to match the vpunpckhdw and
  // vpunpckldw X86 shuffle patterns.

  createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp1, true, false);
  createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp1, false, false);
  scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
  scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7  | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7  | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31

  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec1High =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  Value *IntrVec2High =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *High =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
  Value *High1 =
      Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskHighWord);
  Value *Low =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  Value *Low1 =
      Builder.CreateShuffleVector(IntrVec1High, IntrVec2High, MaskLowWord);

  if (VT == MVT::v16i8) {
    TransposedMatrix[0] = Low;
    TransposedMatrix[1] = High;
    TransposedMatrix[2] = Low1;
    TransposedMatrix[3] = High1;
    return;
  }

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  // ConcatLow and ConcatHigh are built to match the vperm2i128 and
  // vinserti128 X86 shuffle patterns.
  SmallVector<uint32_t, 32> ConcatHigh12, ConcatHigh13;
  createConcatShuffleMask(numberOfElement, ConcatLow, true);
  createConcatShuffleMask(numberOfElement, ConcatHigh, false);

  TransposedMatrix[0] = Builder.CreateShuffleVector(Low, High, ConcatLow);
  TransposedMatrix[1] = Builder.CreateShuffleVector(Low1, High1, ConcatLow);
  TransposedMatrix[2] = Builder.CreateShuffleVector(Low, High, ConcatHigh);
  TransposedMatrix[3] = Builder.CreateShuffleVector(Low1, High1, ConcatHigh);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane), ...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF = 16 with register size 256
// (lanes = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<uint32_t> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}
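// Worked example for createShuffleStride (illustrative): VT = v16i8 (one
// 128-bit lane, LaneSize = 16), Stride = 3:
//   Mask = {(i*3)%16 for i = 0..15}
//        = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}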

// setGroupSize sets 'SizeInfo' to the size (number of elements) of each group
// inside a shuffle mask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}
static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
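// Worked example for setGroupSize (illustrative): for VF = 16 (one lane) the
// loop produces SizeInfo = {6,5,5}: ceil(16/3) = 6, then ceil(14/3) = 5,
// then ceil(15/3) = 5.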

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr works on a per-lane basis.
// Where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm variable sets the offset amount. The result of the function is
// stored inside the ShuffleMask vector and is built as described above.
// AlignDirection is a boolean that indicates the direction of the alignment
// (false - align to the "right" side while true - align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<uint32_t> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {

  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // if i+offset is out of this lane then we actually need the other source
      // If Unary the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
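// Worked example for DecodePALIGNRMask (illustrative): VT = v16i8 (one lane),
// Imm = 2, AlignDirection = true, Unary = false:
//   Offset = 2, so ShuffleMask = {2,3,...,15,16,17}, where elements 16 and 17
//   select from the second source vector.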

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {

  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 32> Concat;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[2];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 3> GroupSize;
  Value *Vec[3], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  for (unsigned i = 0; i < VecElems && VecElems == 32; ++i)
    Concat.push_back(i);

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  for (int i = 0; i < 3; i++)
    Vec[i] = VecElems == 32
                 ? Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat)
                 : InVec[i];

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(
        Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(
      Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(
      Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];

  return;
}

// group2Shuffle reorders the shuffle stride back into contiguous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
                          SmallVectorImpl<uint32_t> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
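// Worked trace for group2Shuffle (illustrative): for VF = 16 (one lane) with
// Mask = GroupSize = {6,5,5}, the first loop yields IndexGroup = {0,11,6}, and
// the second loop cycles through the incremented group bases to produce
// {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}, matching the example above.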

// genShuffleBland - Creates a shuffle mask over two vectors. This function
// only works on instructions with lanes inside 256-bit registers. According
// to the mask 'Mask' it creates a new mask 'Out' by adding an offset to each
// element. The offset amount depends on the two integers 'LowOffset' and
// 'HighOffset', where 'LowOffset' applies to elements taken from the first
// vector and 'HighOffset' to elements taken from the second vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror to the load,
// we must consider the element order as above.
// In this function we are combining two types of shuffles:
// a per-lane shuffle and a "blend"-style shuffle between the two sources.
// By computing the shuffle on a sequence of 16 elements (one lane) and
// adding the correct offset, we create a per-lane shuffle + blend sequence
// between two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
                            SmallVectorImpl<uint32_t> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() == 256 &&
         "This function works on only width of 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + LowOffset);
  for (unsigned i = 0; i < Mask.size(); i++)
    Out.push_back(Mask[i] + HighOffset + NumOfElm);
}
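// For example (illustrative): with VT = v32i8 (NumOfElm = 32), a 16-element
// per-lane Mask, LowOffset = 0 and HighOffset = 16, Out[i] = Mask[i] for the
// first half and Out[16+i] = Mask[i] + 48 for the second half, i.e. a blend
// of the first vector's low lane with the second vector's high lane.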

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {

  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<uint32_t, 3> GroupSize;
  SmallVector<uint32_t, 32> VPShuf;
  SmallVector<uint32_t, 32> VPAlign[3];
  SmallVector<uint32_t, 32> VPAlign2;
  SmallVector<uint32_t, 32> VPAlign3;
  SmallVector<uint32_t, 32> OptimizeShuf[3];
  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(
      InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(
      InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  group2Shuffle(VT, GroupSize, VPShuf);

  if (VT.getSizeInBits() <= 128) {
    for (int i = 0; i < 3; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(
          Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
    return;
  }

  unsigned NumOfElm = VT.getVectorNumElements();
  genShuffleBland(VT, VPShuf, OptimizeShuf[0], 0, 0);
  genShuffleBland(VT, VPShuf, OptimizeShuf[1], 0, NumOfElm / 2);
  genShuffleBland(VT, VPShuf, OptimizeShuf[2], NumOfElm / 2, NumOfElm / 2);

  for (int i = 0; i < 3; i++)
    TransposedMatrix[i] = Builder.CreateShuffleVector(
        Vec[(i * 2) % 3], Vec[(i * 2 + 1) % 3], OptimizeShuf[i]);

  return;
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  uint32_t IntMask1[] = {0, 1, 4, 5};
  ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  uint32_t IntMask2[] = {2, 3, 6, 7};
  Mask = makeArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  uint32_t IntMask3[] = {0, 4, 2, 6};
  Mask = makeArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  uint32_t IntMask4[] = {1, 5, 3, 7};
  Mask = makeArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating optimized target-specific instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}

// Lower interleaved load(s) into target specific instructions/
// intrinsics. Lowering sequence varies depending on the vector-types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for the combinations listed in
// X86InterleavedAccessGroup::isSupported() above.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}