Blame - llvm/lib/Target/X86/X86InterleavedAccess.cpp - toolchain/llvm-project

blob: b0c3bb28f7230a6950d64c3365c2e0ad752b9776 [file] [log] [blame]

David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	1	//===--------- X86InterleavedAccess.cpp ----------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	8	//===--------------------------------------------------------------------===//
				9	///
				10	/// \file
				11	/// This file contains the X86 implementation of the interleaved accesses
				12	/// optimization generating X86-specific instructions/intrinsics for
				13	/// interleaved access groups.
				14	///
				15	//===--------------------------------------------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	16
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	17	#include "X86TargetMachine.h"
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	18	#include "llvm/Analysis/VectorUtils.h"
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	19
				20	using namespace llvm;
				21
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	22	namespace {
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	23	/// \brief This class holds necessary information to represent an interleaved
				24	/// access group and supports utilities to lower the group into
				25	/// X86-specific instructions/intrinsics.
				26	/// E.g. A group of interleaving access loads (Factor = 2; accessing every
				27	/// other element)
				28	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
				29	/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
				30	/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	31	class X86InterleavedAccessGroup {
				32	/// \brief Reference to the wide-load instruction of an interleaved access
				33	/// group.
				34	Instruction *const Inst;
				35
				36	/// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
				37	ArrayRef<ShuffleVectorInst *> Shuffles;
				38
				39	/// \brief Reference to the starting index of each user-shuffle.
				40	ArrayRef<unsigned> Indices;
				41
				42	/// \brief Reference to the interleaving stride in terms of elements.
				43	const unsigned Factor;
				44
				45	/// \brief Reference to the underlying target.
				46	const X86Subtarget &Subtarget;
				47
				48	const DataLayout &DL;
				49
				50	IRBuilder<> &Builder;
				51
				52	/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	53	/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
				54	void decompose(Instruction Inst, unsigned NumSubVectors, VectorType T,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	55	SmallVectorImpl<Instruction *> &DecomposedVectors);
				56
				57	/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
				58	/// returns the transposed-vectors in \p TransposedVectors.
				59	/// E.g.
				60	/// InputVectors:
				61	/// In-V0 = p1, p2, p3, p4
				62	/// In-V1 = q1, q2, q3, q4
				63	/// In-V2 = r1, r2, r3, r4
				64	/// In-V3 = s1, s2, s3, s4
				65	/// OutputVectors:
				66	/// Out-V0 = p1, q1, r1, s1
				67	/// Out-V1 = p2, q2, r2, s2
				68	/// Out-V2 = p3, q3, r3, s3
				69	/// Out-V3 = P4, q4, r4, s4
				70	void transpose_4x4(ArrayRef<Instruction *> InputVectors,
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	71	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	72	void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
				73	SmallVectorImpl<Value *> &TransposedMatrix,
				74	unsigned NumSubVecElems);
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	75	void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
				76	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	77	void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				78	SmallVectorImpl<Value *> &TransposedMatrix,
				79	unsigned NumSubVecElems);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	80	void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				81	SmallVectorImpl<Value *> &TransposedMatrix,
				82	unsigned NumSubVecElems);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	83
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	84	public:
				85	/// In order to form an interleaved access group X86InterleavedAccessGroup
				86	/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
				87	/// \p Shuffs, reference to the first indices of each interleaved-vector
				88	/// \p 'Ind' and the interleaving stride factor \p F. In order to generate
				89	/// X86-specific instructions/intrinsics it also requires the underlying
				90	/// target information \p STarget.
				91	explicit X86InterleavedAccessGroup(Instruction *I,
				92	ArrayRef<ShuffleVectorInst *> Shuffs,
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	93	ArrayRef<unsigned> Ind, const unsigned F,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	94	const X86Subtarget &STarget,
				95	IRBuilder<> &B)
				96	: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
				97	DL(Inst->getModule()->getDataLayout()), Builder(B) {}
				98
				99	/// \brief Returns true if this interleaved access group can be lowered into
				100	/// x86-specific instructions/intrinsics, false otherwise.
				101	bool isSupported() const;
				102
				103	/// \brief Lowers this interleaved access group into X86-specific
				104	/// instructions/intrinsics.
				105	bool lowerIntoOptimizedSequence();
				106	};
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	107	} // end anonymous namespace
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	108
				109	bool X86InterleavedAccessGroup::isSupported() const {
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	110	VectorType *ShuffleVecTy = Shuffles[0]->getType();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	111	Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	112	unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	113	unsigned WideInstSize;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	114
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	115	// Currently, lowering is supported for the following vectors:
				116	// Stride 4:
				117	// 1. Store and load of 4-element vectors of 64 bits on AVX.
				118	// 2. Store of 16/32-element vectors of 8 bits on AVX.
				119	// Stride 3:
				120	// 1. Load of 16/32-element vecotrs of 8 bits on AVX.
				121	if (!Subtarget.hasAVX() \|\| (Factor != 4 && Factor != 3))
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	122	return false;
				123
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	124	if (isa<LoadInst>(Inst)) {
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	125	WideInstSize = DL.getTypeSizeInBits(Inst->getType());
				126	} else
				127	WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
				128
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	129	// We support shuffle represents stride 4 for byte type with size of
				130	// WideInstSize.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	131	if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
				132	return true;
				133
				134	if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	135	(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024))
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	136	return true;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	137
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	138	if (ShuffleElemSize == 8 && Factor == 3 &&
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	139	(WideInstSize == 384 \|\| WideInstSize == 768))
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	140	return true;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	141
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	142	return false;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	143	}
				144
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	145	void X86InterleavedAccessGroup::decompose(
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	146	Instruction VecInst, unsigned NumSubVectors, VectorType SubVecTy,
				147	SmallVectorImpl<Instruction *> &DecomposedVectors) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	148
				149	assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&
				150	"Expected Load or Shuffle");
				151
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	152	Type *VecTy = VecInst->getType();
Benjamin Kramer	215b22e	2016-12-01 20:49:34 +0000	[diff] [blame]	153	(void)VecTy;
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	154	assert(VecTy->isVectorTy() &&
				155	DL.getTypeSizeInBits(VecTy) >=
				156	DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
				157	"Invalid Inst-size!!!");
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	158
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	159	if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
				160	Value *Op0 = SVI->getOperand(0);
				161	Value *Op1 = SVI->getOperand(1);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	162
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	163	// Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
				164	for (unsigned i = 0; i < NumSubVectors; ++i)
				165	DecomposedVectors.push_back(
				166	cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	167	Op0, Op1,
				168	createSequentialMask(Builder, Indices[i],
				169	SubVecTy->getVectorNumElements(), 0))));
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	170	return;
				171	}
				172
				173	// Decompose the load instruction.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	174	LoadInst *LI = cast<LoadInst>(VecInst);
				175	Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	176	Value *VecBasePtr;
				177	unsigned int NumLoads = NumSubVectors;
				178	// In the case of stride 3 with a vector of 32 elements load the information
				179	// in the following way:
				180	// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
				181	if (DL.getTypeSizeInBits(VecTy) == 768) {
				182	Type *VecTran =
				183	VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
				184	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
				185	NumLoads = NumSubVectors * 2;
				186	} else
				187	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	188	// Generate N loads of T type.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	189	for (unsigned i = 0; i < NumLoads; i++) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	190	// TODO: Support inbounds GEP.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	191	Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
				192	Instruction *NewLoad =
				193	Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
				194	DecomposedVectors.push_back(NewLoad);
				195	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	196	}
				197
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	198	// Create shuffle mask for concatenation of two half vectors.
				199	// Low = false: mask generated for the shuffle
				200	// shuffle(VEC1,VEC2,{NumElement/2, NumElement/2+1, NumElement/2+2...,
				201	// NumElement-1, NumElement+NumElement/2,
				202	// NumElement+NumElement/2+1..., 2*NumElement-1})
				203	// = concat(high_half(VEC1),high_half(VEC2))
				204	// Low = true: mask generated for the shuffle
				205	// shuffle(VEC1,VEC2,{0,1,2,...,NumElement/2-1,NumElement,
				206	// NumElement+1...,NumElement+NumElement/2-1})
				207	// = concat(low_half(VEC1),low_half(VEC2))
				208	static void createConcatShuffleMask(int NumElements,
				209	SmallVectorImpl<uint32_t> &Mask, bool Low) {
				210	int NumHalfElements = NumElements / 2;
				211	int Offset = Low ? 0 : NumHalfElements;
				212	for (int i = 0; i < NumHalfElements; ++i)
				213	Mask.push_back(i + Offset);
				214	for (int i = 0; i < NumHalfElements; ++i)
				215	Mask.push_back(i + Offset + NumElements);
				216	}
				217
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	218	// Changing the scale of the vector type by reducing the number of elements and
				219	// doubling the scalar size.
				220	static MVT scaleVectorType(MVT VT) {
				221	unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
				222	return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
				223	VT.getVectorNumElements() / 2);
				224	}
				225
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	226	void X86InterleavedAccessGroup::interleave8bitStride4VF8(
				227	ArrayRef<Instruction *> Matrix,
				228	SmallVectorImpl<Value *> &TransposedMatrix) {
				229	// Assuming we start from the following vectors:
				230	// Matrix[0]= c0 c1 c2 c3 c4 ... c7
				231	// Matrix[1]= m0 m1 m2 m3 m4 ... m7
				232	// Matrix[2]= y0 y1 y2 y3 y4 ... y7
				233	// Matrix[3]= k0 k1 k2 k3 k4 ... k7
				234
				235	MVT VT = MVT::v8i16;
				236	TransposedMatrix.resize(2);
				237	SmallVector<uint32_t, 16> MaskLow;
				238	SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
				239	SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
				240
				241	for (unsigned i = 0; i < 8; ++i) {
				242	MaskLow.push_back(i);
				243	MaskLow.push_back(i + 8);
				244	}
				245
				246	createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
				247	createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
				248	scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
				249	scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
				250	// IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
				251	// IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
				252	Value *IntrVec1Low =
				253	Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				254	Value *IntrVec2Low =
				255	Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				256
				257	// TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
				258	// TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
				259
				260	TransposedMatrix[0] =
				261	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
				262	TransposedMatrix[1] =
				263	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
				264	}
				265
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	266	void X86InterleavedAccessGroup::interleave8bitStride4(
				267	ArrayRef<Instruction > Matrix, SmallVectorImpl<Value > &TransposedMatrix,
				268	unsigned numberOfElement) {
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	269
				270	// Example: Assuming we start from the following vectors:
				271	// Matrix[0]= c0 c1 c2 c3 c4 ... c31
				272	// Matrix[1]= m0 m1 m2 m3 m4 ... m31
				273	// Matrix[2]= y0 y1 y2 y3 y4 ... y31
				274	// Matrix[3]= k0 k1 k2 k3 k4 ... k31
				275
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	276	MVT VT = MVT::getVectorVT(MVT::i8, numberOfElement);
				277	MVT HalfVT = scaleVectorType(VT);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	278
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	279	TransposedMatrix.resize(4);
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	280	SmallVector<uint32_t, 32> MaskHigh;
				281	SmallVector<uint32_t, 32> MaskLow;
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	282	SmallVector<uint32_t, 32> LowHighMask[2];
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	283	SmallVector<uint32_t, 32> MaskHighTemp1;
				284	SmallVector<uint32_t, 32> MaskLowTemp1;
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	285	SmallVector<uint32_t, 32> MaskHighWord;
				286	SmallVector<uint32_t, 32> MaskLowWord;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	287	SmallVector<uint32_t, 32> ConcatLow;
				288	SmallVector<uint32_t, 32> ConcatHigh;
				289
				290	// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
				291	// shuffle pattern.
				292
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	293	createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	294	createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	295
				296	// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
				297	// shuffle pattern.
				298
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	299	createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp1, true, false);
				300	createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp1, false, false);
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	301	scaleShuffleMask<uint32_t>(2, MaskLowTemp1, LowHighMask[0]);
				302	scaleShuffleMask<uint32_t>(2, MaskHighTemp1, LowHighMask[1]);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	303
				304	// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 \| c16 m16 c17 m17 ... c23 m23
				305	// IntrVec1High = c8 m8 c9 m9 ... c15 m15 \| c24 m24 c25 m25 ... c31 m31
				306	// IntrVec2Low = y0 k0 y1 k1 ... y7 k7 \| y16 k16 y17 k17 ... y23 k23
				307	// IntrVec2High = y8 k8 y9 k9 ... y15 k15 \| y24 k24 y25 k25 ... y31 k31
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	308	Value *IntrVec[4];
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	309
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	310	IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				311	IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
				312	IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				313	IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	314
				315	// cmyk4 cmyk5 cmyk6 cmyk7 \| cmyk20 cmyk21 cmyk22 cmyk23
				316	// cmyk12 cmyk13 cmyk14 cmyk15 \| cmyk28 cmyk29 cmyk30 cmyk31
				317	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk16 cmyk17 cmyk18 cmyk19
				318	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk24 cmyk25 cmyk26 cmyk27
				319
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	320	Value *VecOut[4];
				321	for (int i = 0; i < 4; i++)
				322	VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
				323	LowHighMask[i % 2]);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	324
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	325	if (VT == MVT::v16i8) {
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	326	std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	327	return;
				328	}
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	329
				330	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk4 cmyk5 cmyk6 cmyk7
				331	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk12 cmyk13 cmyk14 cmyk15
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	332	// cmyk16 cmyk17 cmyk18 cmyk19 \| cmyk20 cmyk21 cmyk22 cmyk23
				333	// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31
				334
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	335	// ConcatHigh and ConcatLow built in the vperm2i128 and vinserti128 X86
				336	// shuffle pattern.
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	337	createConcatShuffleMask(numberOfElement, ConcatLow, true);
				338	createConcatShuffleMask(numberOfElement, ConcatHigh, false);
				339
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame^]	340	TransposedMatrix[0] = Builder.CreateShuffleVector(VecOut[0], VecOut[1], ConcatLow);
				341	TransposedMatrix[1] = Builder.CreateShuffleVector(VecOut[2], VecOut[3], ConcatLow);
				342	TransposedMatrix[2] = Builder.CreateShuffleVector(VecOut[0], VecOut[1], ConcatHigh);
				343	TransposedMatrix[3] = Builder.CreateShuffleVector(VecOut[2], VecOut[3], ConcatHigh);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	344	}
				345
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	346	// createShuffleStride returns shuffle mask of size N.
				347	// The shuffle pattern is as following :
				348	// {0, Stride%(VF/Lane), (2Stride%(VF/Lane))...(VFStride/Lane)%(VF/Lane),
				349	// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
				350	// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
				351	// Where Lane is the # of lanes in a register:
				352	// VectorSize = 128 => Lane = 1
				353	// VectorSize = 256 => Lane = 2
				354	// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
				355	// {<[0\|3\|6\|1\|4\|7\|2\|5]-[8\|11\|14\|9\|12\|15\|10\|13]>}
				356	static void createShuffleStride(MVT VT, int Stride,
				357	SmallVectorImpl<uint32_t> &Mask) {
				358	int VectorSize = VT.getSizeInBits();
				359	int VF = VT.getVectorNumElements();
				360	int LaneCount = std::max(VectorSize / 128, 1);
				361	for (int Lane = 0; Lane < LaneCount; Lane++)
				362	for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
				363	Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
				364	}
				365
				366	// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
				367	// inside mask a shuffleMask. A mask contains exactly 3 groups, where
				368	// each group is a monotonically increasing sequence with stride 3.
				369	// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
				370	static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
				371	int VectorSize = VT.getSizeInBits();
				372	int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
				373	for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
				374	int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
				375	SizeInfo.push_back(GroupSize);
				376	FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
				377	}
				378	}
				379
				380	// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
				381	// vpalign works according to lanes
				382	// Where Lane is the # of lanes in a register:
				383	// VectorWide = 128 => Lane = 1
				384	// VectorWide = 256 => Lane = 2
				385	// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
				386	// For Lane = 2 shuffle pattern is:
				387	// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
				388	// Imm variable sets the offset amount. The result of the
				389	// function is stored inside ShuffleMask vector and it built as described in
				390	// the begin of the description. AlignDirection is a boolean that indecat the
				391	// direction of the alignment. (false - align to the "right" side while true -
				392	// align to the "left" side)
				393	static void DecodePALIGNRMask(MVT VT, unsigned Imm,
				394	SmallVectorImpl<uint32_t> &ShuffleMask,
				395	bool AlignDirection = true, bool Unary = false) {
				396
				397	unsigned NumElts = VT.getVectorNumElements();
				398	unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
				399	unsigned NumLaneElts = NumElts / NumLanes;
				400
				401	Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
				402	unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
				403
				404	for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
				405	for (unsigned i = 0; i != NumLaneElts; ++i) {
				406	unsigned Base = i + Offset;
				407	// if i+offset is out of this lane then we actually need the other source
				408	// If Unary the other source is the first source.
				409	if (Base >= NumLaneElts)
				410	Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
				411	ShuffleMask.push_back(Base + l);
				412	}
				413	}
				414	}
				415
				416	void X86InterleavedAccessGroup::deinterleave8bitStride3(
				417	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				418	unsigned VecElems) {
				419
				420	// Example: Assuming we start from the following vectors:
				421	// Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
				422	// Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
				423	// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
				424
				425	TransposedMatrix.resize(3);
				426	SmallVector<uint32_t, 32> Concat;
				427	SmallVector<uint32_t, 32> VPShuf;
				428	SmallVector<uint32_t, 32> VPAlign[2];
				429	SmallVector<uint32_t, 32> VPAlign2;
				430	SmallVector<uint32_t, 32> VPAlign3;
				431	SmallVector<uint32_t, 3> GroupSize;
				432	Value Vec[3], TempVector[3];
				433
				434	MVT VT = MVT::getVT(Shuffles[0]->getType());
				435
				436	for (unsigned i = 0; i < VecElems && VecElems == 32; ++i)
				437	Concat.push_back(i);
				438
				439	createShuffleStride(VT, 3, VPShuf);
				440	setGroupSize(VT, GroupSize);
				441
				442	for (int i = 0; i < 2; i++)
				443	DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
				444
				445	DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
				446	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
				447
				448	for (int i = 0; i < 3; i++)
				449	Vec[i] = VecElems == 32
				450	? Builder.CreateShuffleVector(InVec[i], InVec[i + 3], Concat)
				451	: InVec[i];
				452
				453	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				454	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				455	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				456
				457	for (int i = 0; i < 3; i++)
				458	Vec[i] = Builder.CreateShuffleVector(
				459	Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
				460
				461	// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
				462	// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
				463	// TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
				464
				465	for (int i = 0; i < 3; i++)
				466	TempVector[i] =
				467	Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
				468
				469	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				470	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				471	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				472
				473	for (int i = 0; i < 3; i++)
				474	Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
				475	VPAlign[1]);
				476
				477	// TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				478	// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				479	// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
				480
				481	Value *TempVec = Builder.CreateShuffleVector(
				482	Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
				483	TransposedMatrix[0] = Builder.CreateShuffleVector(
				484	Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
				485	TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
				486	TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
				487
				488	return;
				489	}
				490
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	491	// group2Shuffle reorder the shuffle stride back into continuous order.
				492	// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
				493	// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
				494	static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
				495	SmallVectorImpl<uint32_t> &Output) {
				496	int IndexGroup[3] = {0, 0, 0};
				497	int Index = 0;
				498	int VectorWidth = VT.getSizeInBits();
				499	int VF = VT.getVectorNumElements();
				500	// Find the index of the different groups.
				501	int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
				502	for (int i = 0; i < 3; i++) {
				503	IndexGroup[(Index * 3) % (VF / Lane)] = Index;
				504	Index += Mask[i];
				505	}
				506	// According to the index compute the convert mask.
				507	for (int i = 0; i < VF / Lane; i++) {
				508	Output.push_back(IndexGroup[i % 3]);
				509	IndexGroup[i % 3]++;
				510	}
				511	}
				512
				513	// genShuffleBland - Creates shuffle according to two vectors.This function is
				514	// only works on instructions with lane inside 256 registers. According to
				515	// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
				516	// offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
				517	// Where the 'LowOffset' refers to the first vector and the highOffset refers to
				518	// the second vector.
				519	// \|a0....a5,b0....b4,c0....c4\|a16..a21,b16..b20,c16..c20\|
				520	// \|c5...c10,a5....a9,b5....b9\|c21..c26,a22..a26,b21..b25\|
				521	// \|b10..b15,c11..c15,a10..a15\|b26..b31,c27..c31,a27..a31\|
				522	// For the sequence to work as a mirror to the load.
				523	// We must consider the elements order as above.
				524	// In this function we are combining two types of shuffles.
				525	// The first one is vpshufed and the second is a type of "blend" shuffle.
				526	// By computing the shuffle on a sequence of 16 elements(one lane) and add the
				527	// correct offset. We are creating a vpsuffed + blend sequence between two
				528	// shuffles.
Michael Zuckerman	0b5db55	2017-09-29 12:45:54 +0000	[diff] [blame]	529	static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	530	SmallVectorImpl<uint32_t> &Out, int LowOffset,
				531	int HighOffset) {
				532	assert(VT.getSizeInBits() == 256 &&
				533	"This function works on only width of 256");
				534	unsigned NumOfElm = VT.getVectorNumElements();
				535	for (unsigned i = 0; i < Mask.size(); i++)
				536	Out.push_back(Mask[i] + LowOffset);
				537	for (unsigned i = 0; i < Mask.size(); i++)
				538	Out.push_back(Mask[i] + HighOffset + NumOfElm);
				539	}
				540
				541	void X86InterleavedAccessGroup::interleave8bitStride3(
				542	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				543	unsigned VecElems) {
				544
				545	// Example: Assuming we start from the following vectors:
				546	// Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				547	// Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				548	// Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
				549
				550	TransposedMatrix.resize(3);
				551	SmallVector<uint32_t, 3> GroupSize;
				552	SmallVector<uint32_t, 32> VPShuf;
				553	SmallVector<uint32_t, 32> VPAlign[3];
				554	SmallVector<uint32_t, 32> VPAlign2;
				555	SmallVector<uint32_t, 32> VPAlign3;
				556	SmallVector<uint32_t, 32> OptimizeShuf[3];
				557	Value Vec[3], TempVector[3];
				558	MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
				559
				560	setGroupSize(VT, GroupSize);
				561
				562	for (int i = 0; i < 3; i++)
				563	DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
				564
				565	DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
				566	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
				567
				568	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				569	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				570	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				571
				572	Vec[0] = Builder.CreateShuffleVector(
				573	InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
				574	Vec[1] = Builder.CreateShuffleVector(
				575	InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
				576	Vec[2] = InVec[2];
				577
				578	// Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
				579	// Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
				580	// Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
				581
				582	for (int i = 0; i < 3; i++)
				583	TempVector[i] =
				584	Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
				585
				586	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				587	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				588	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				589
				590	for (int i = 0; i < 3; i++)
				591	Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
				592	VPAlign[2]);
				593
				594	// TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
				595	// TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
				596	// TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
				597
				598	group2Shuffle(VT, GroupSize, VPShuf);
				599
				600	if (VT.getSizeInBits() <= 128) {
				601	for (int i = 0; i < 3; i++)
				602	TransposedMatrix[i] = Builder.CreateShuffleVector(
				603	Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
				604	return;
				605	}
				606
				607	unsigned NumOfElm = VT.getVectorNumElements();
				608	genShuffleBland(VT, VPShuf, OptimizeShuf[0], 0, 0);
				609	genShuffleBland(VT, VPShuf, OptimizeShuf[1], 0, NumOfElm / 2);
				610	genShuffleBland(VT, VPShuf, OptimizeShuf[2], NumOfElm / 2, NumOfElm / 2);
				611
				612	for (int i = 0; i < 3; i++)
				613	TransposedMatrix[i] = Builder.CreateShuffleVector(
				614	Vec[(i * 2) % 3], Vec[(i * 2 + 1) % 3], OptimizeShuf[i]);
				615
				616	return;
				617	}
				618
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	619	void X86InterleavedAccessGroup::transpose_4x4(
				620	ArrayRef<Instruction *> Matrix,
				621	SmallVectorImpl<Value *> &TransposedMatrix) {
				622	assert(Matrix.size() == 4 && "Invalid matrix size");
				623	TransposedMatrix.resize(4);
				624
				625	// dst = src1[0,1],src2[0,1]
				626	uint32_t IntMask1[] = {0, 1, 4, 5};
				627	ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
				628	Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				629	Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				630
				631	// dst = src1[2,3],src2[2,3]
				632	uint32_t IntMask2[] = {2, 3, 6, 7};
				633	Mask = makeArrayRef(IntMask2, 4);
				634	Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				635	Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				636
				637	// dst = src1[0],src2[0],src1[2],src2[2]
				638	uint32_t IntMask3[] = {0, 4, 2, 6};
				639	Mask = makeArrayRef(IntMask3, 4);
				640	TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				641	TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				642
				643	// dst = src1[1],src2[1],src1[3],src2[3]
				644	uint32_t IntMask4[] = {1, 5, 3, 7};
				645	Mask = makeArrayRef(IntMask4, 4);
				646	TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				647	TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				648	}
				649
				650	// Lowers this interleaved access group into X86-specific
				651	// instructions/intrinsics.
				652	bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
				653	SmallVector<Instruction *, 4> DecomposedVectors;
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	654	SmallVector<Value *, 4> TransposedVectors;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	655	VectorType *ShuffleTy = Shuffles[0]->getType();
				656
				657	if (isa<LoadInst>(Inst)) {
				658	// Try to generate target-sized register(/instruction).
				659	decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
				660
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	661	Type *ShuffleEltTy = Inst->getType();
				662	unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	663	// Perform matrix-transposition in order to compute interleaved
				664	// results by generating some sort of (optimized) target-specific
				665	// instructions.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	666
				667	switch (NumSubVecElems) {
				668	default:
				669	return false;
				670	case 4:
				671	transpose_4x4(DecomposedVectors, TransposedVectors);
				672	break;
				673	case 8:
				674	case 16:
				675	case 32:
				676	deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
				677	NumSubVecElems);
				678	break;
				679	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	680
				681	// Now replace the unoptimized-interleaved-vectors with the
				682	// transposed-interleaved vectors.
				683	for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
				684	Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
				685
				686	return true;
				687	}
				688
				689	Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
				690	unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
				691
				692	// Lower the interleaved stores:
				693	// 1. Decompose the interleaved wide shuffle into individual shuffle
				694	// vectors.
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	695	decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
				696	DecomposedVectors);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	697
				698	// 2. Transpose the interleaved-vectors into vectors of contiguous
				699	// elements.
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	700	switch (NumSubVecElems) {
				701	case 4:
				702	transpose_4x4(DecomposedVectors, TransposedVectors);
				703	break;
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	704	case 8:
				705	interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
				706	break;
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	707	case 16:
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	708	case 32:
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	709	if (Factor == 4)
				710	interleave8bitStride4(DecomposedVectors, TransposedVectors,
				711	NumSubVecElems);
				712	if (Factor == 3)
				713	interleave8bitStride3(DecomposedVectors, TransposedVectors,
				714	NumSubVecElems);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	715	break;
				716	default:
				717	return false;
				718	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	719
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	720	// 3. Concatenate the contiguous-vectors back into a wide vector.
				721	Value *WideVec = concatenateVectors(Builder, TransposedVectors);
				722
				723	// 4. Generate a store instruction for wide-vec.
				724	StoreInst *SI = cast<StoreInst>(Inst);
				725	Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
				726	SI->getAlignment());
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	727
				728	return true;
				729	}
				730
				731	// Lower interleaved load(s) into target specific instructions/
				732	// intrinsics. Lowering sequence varies depending on the vector-types, factor,
				733	// number of shuffles and ISA.
				734	// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	735	bool X86TargetLowering::lowerInterleavedLoad(
				736	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
				737	ArrayRef<unsigned> Indices, unsigned Factor) const {
				738	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				739	"Invalid interleave factor");
				740	assert(!Shuffles.empty() && "Empty shufflevector input");
				741	assert(Shuffles.size() == Indices.size() &&
				742	"Unmatched number of shufflevectors and indices");
				743
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	744	// Create an interleaved access group.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	745	IRBuilder<> Builder(LI);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	746	X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
				747	Builder);
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	748
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	749	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	750	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	751
				752	bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
				753	ShuffleVectorInst *SVI,
				754	unsigned Factor) const {
				755	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				756	"Invalid interleave factor");
				757
Farhana Aleen	9bd593e	2017-06-22 23:56:31 +0000	[diff] [blame]	758	assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	759	"Invalid interleaved store");
				760
				761	// Holds the indices of SVI that correspond to the starting index of each
				762	// interleaved shuffle.
				763	SmallVector<unsigned, 4> Indices;
				764	auto Mask = SVI->getShuffleMask();
				765	for (unsigned i = 0; i < Factor; i++)
				766	Indices.push_back(Mask[i]);
				767
				768	ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
				769
				770	// Create an interleaved access group.
				771	IRBuilder<> Builder(SI);
				772	X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
				773	Builder);
				774
				775	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
				776	}
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	777