Blame - llvm/lib/Target/X86/X86InterleavedAccess.cpp - toolchain/llvm-project

blob: cdb24b9d40a67c74c39be8bfad2f96def238cbe7 [file] [log] [blame]

Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	1	//===- X86InterleavedAccess.cpp -------------------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	8	//===----------------------------------------------------------------------===//
				9	//
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	10	/// \file
				11	/// This file contains the X86 implementation of the interleaved accesses
				12	/// optimization generating X86-specific instructions/intrinsics for
				13	/// interleaved access groups.
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	14	//
				15	//===----------------------------------------------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	16
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	17	#include "X86ISelLowering.h"
				18	#include "X86Subtarget.h"
				19	#include "llvm/ADT/ArrayRef.h"
				20	#include "llvm/ADT/SmallVector.h"
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	21	#include "llvm/Analysis/VectorUtils.h"
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	22	#include "llvm/CodeGen/MachineValueType.h"
				23	#include "llvm/IR/Constants.h"
				24	#include "llvm/IR/DataLayout.h"
				25	#include "llvm/IR/DerivedTypes.h"
				26	#include "llvm/IR/IRBuilder.h"
				27	#include "llvm/IR/Instruction.h"
				28	#include "llvm/IR/Instructions.h"
				29	#include "llvm/IR/Module.h"
				30	#include "llvm/IR/Type.h"
				31	#include "llvm/IR/Value.h"
				32	#include "llvm/Support/Casting.h"
				33	#include <algorithm>
				34	#include <cassert>
				35	#include <cmath>
				36	#include <cstdint>
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	37
				38	using namespace llvm;
				39
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	40	namespace {
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	41
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	42	/// \brief This class holds necessary information to represent an interleaved
				43	/// access group and supports utilities to lower the group into
				44	/// X86-specific instructions/intrinsics.
				45	/// E.g. A group of interleaving access loads (Factor = 2; accessing every
				46	/// other element)
				47	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
				48	/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
				49	/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	50	class X86InterleavedAccessGroup {
				51	/// \brief Reference to the wide-load instruction of an interleaved access
				52	/// group.
				53	Instruction *const Inst;
				54
				55	/// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
				56	ArrayRef<ShuffleVectorInst *> Shuffles;
				57
				58	/// \brief Reference to the starting index of each user-shuffle.
				59	ArrayRef<unsigned> Indices;
				60
				61	/// \brief Reference to the interleaving stride in terms of elements.
				62	const unsigned Factor;
				63
				64	/// \brief Reference to the underlying target.
				65	const X86Subtarget &Subtarget;
				66
				67	const DataLayout &DL;
				68
				69	IRBuilder<> &Builder;
				70
				71	/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	72	/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
				73	void decompose(Instruction Inst, unsigned NumSubVectors, VectorType T,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	74	SmallVectorImpl<Instruction *> &DecomposedVectors);
				75
				76	/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
				77	/// returns the transposed-vectors in \p TransposedVectors.
				78	/// E.g.
				79	/// InputVectors:
				80	/// In-V0 = p1, p2, p3, p4
				81	/// In-V1 = q1, q2, q3, q4
				82	/// In-V2 = r1, r2, r3, r4
				83	/// In-V3 = s1, s2, s3, s4
				84	/// OutputVectors:
				85	/// Out-V0 = p1, q1, r1, s1
				86	/// Out-V1 = p2, q2, r2, s2
				87	/// Out-V2 = p3, q3, r3, s3
				88	/// Out-V3 = P4, q4, r4, s4
				89	void transpose_4x4(ArrayRef<Instruction *> InputVectors,
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	90	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	91	void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
				92	SmallVectorImpl<Value *> &TransposedMatrix,
				93	unsigned NumSubVecElems);
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	94	void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
				95	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	96	void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				97	SmallVectorImpl<Value *> &TransposedMatrix,
				98	unsigned NumSubVecElems);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	99	void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				100	SmallVectorImpl<Value *> &TransposedMatrix,
				101	unsigned NumSubVecElems);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	102
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	103	public:
				104	/// In order to form an interleaved access group X86InterleavedAccessGroup
				105	/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
				106	/// \p Shuffs, reference to the first indices of each interleaved-vector
				107	/// \p 'Ind' and the interleaving stride factor \p F. In order to generate
				108	/// X86-specific instructions/intrinsics it also requires the underlying
				109	/// target information \p STarget.
				110	explicit X86InterleavedAccessGroup(Instruction *I,
				111	ArrayRef<ShuffleVectorInst *> Shuffs,
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	112	ArrayRef<unsigned> Ind, const unsigned F,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	113	const X86Subtarget &STarget,
				114	IRBuilder<> &B)
				115	: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
				116	DL(Inst->getModule()->getDataLayout()), Builder(B) {}
				117
				118	/// \brief Returns true if this interleaved access group can be lowered into
				119	/// x86-specific instructions/intrinsics, false otherwise.
				120	bool isSupported() const;
				121
				122	/// \brief Lowers this interleaved access group into X86-specific
				123	/// instructions/intrinsics.
				124	bool lowerIntoOptimizedSequence();
				125	};
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	126
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	127	} // end anonymous namespace
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	128
				129	bool X86InterleavedAccessGroup::isSupported() const {
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	130	VectorType *ShuffleVecTy = Shuffles[0]->getType();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	131	Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	132	unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	133	unsigned WideInstSize;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	134
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	135	// Currently, lowering is supported for the following vectors:
				136	// Stride 4:
				137	// 1. Store and load of 4-element vectors of 64 bits on AVX.
				138	// 2. Store of 16/32-element vectors of 8 bits on AVX.
				139	// Stride 3:
Craig Topper	2153114	2017-11-14 16:14:00 +0000	[diff] [blame]	140	// 1. Load of 16/32-element vectors of 8 bits on AVX.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	141	if (!Subtarget.hasAVX() \|\| (Factor != 4 && Factor != 3))
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	142	return false;
				143
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	144	if (isa<LoadInst>(Inst)) {
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	145	WideInstSize = DL.getTypeSizeInBits(Inst->getType());
Michael Zuckerman	72a6f89	2017-10-18 08:04:31 +0000	[diff] [blame]	146	if (cast<LoadInst>(Inst)->getPointerAddressSpace())
				147	return false;
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	148	} else
				149	WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
				150
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	151	// We support shuffle represents stride 4 for byte type with size of
				152	// WideInstSize.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	153	if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
				154	return true;
				155
				156	if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	157	(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024 \|\|
				158	WideInstSize == 2048))
				159	return true;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	160
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	161	if (ShuffleElemSize == 8 && Factor == 3 &&
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	162	(WideInstSize == 384 \|\| WideInstSize == 768 \|\| WideInstSize == 1536))
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	163	return true;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	164
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	165	return false;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	166	}
				167
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	168	void X86InterleavedAccessGroup::decompose(
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	169	Instruction VecInst, unsigned NumSubVectors, VectorType SubVecTy,
				170	SmallVectorImpl<Instruction *> &DecomposedVectors) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	171	assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&
				172	"Expected Load or Shuffle");
				173
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	174	Type *VecWidth = VecInst->getType();
				175	(void)VecWidth;
				176	assert(VecWidth->isVectorTy() &&
				177	DL.getTypeSizeInBits(VecWidth) >=
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	178	DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
				179	"Invalid Inst-size!!!");
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	180
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	181	if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
				182	Value *Op0 = SVI->getOperand(0);
				183	Value *Op1 = SVI->getOperand(1);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	184
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	185	// Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
				186	for (unsigned i = 0; i < NumSubVectors; ++i)
				187	DecomposedVectors.push_back(
				188	cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	189	Op0, Op1,
				190	createSequentialMask(Builder, Indices[i],
				191	SubVecTy->getVectorNumElements(), 0))));
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	192	return;
				193	}
				194
				195	// Decompose the load instruction.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	196	LoadInst *LI = cast<LoadInst>(VecInst);
				197	Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	198	Value *VecBasePtr;
				199	unsigned int NumLoads = NumSubVectors;
				200	// In the case of stride 3 with a vector of 32 elements load the information
				201	// in the following way:
				202	// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	203	unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
				204	if (VecLength == 768 \|\| VecLength == 1536) {
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	205	Type *VecTran =
				206	VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
				207	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	208	NumLoads = NumSubVectors * (VecLength / 384);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	209	} else
				210	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	211	// Generate N loads of T type.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	212	for (unsigned i = 0; i < NumLoads; i++) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	213	// TODO: Support inbounds GEP.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	214	Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
				215	Instruction *NewLoad =
				216	Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
				217	DecomposedVectors.push_back(NewLoad);
				218	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	219	}
				220
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	221	// Changing the scale of the vector type by reducing the number of elements and
				222	// doubling the scalar size.
				223	static MVT scaleVectorType(MVT VT) {
				224	unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
				225	return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
				226	VT.getVectorNumElements() / 2);
				227	}
				228
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	229	static uint32_t Concat[] = {
				230	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
				231	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
				232	32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
				233	48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
				234
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	235	// genShuffleBland - Creates shuffle according to two vectors.This function is
				236	// only works on instructions with lane inside 256 registers. According to
				237	// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
				238	// offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
				239	// Where the 'LowOffset' refers to the first vector and the highOffset refers to
				240	// the second vector.
				241	// \|a0....a5,b0....b4,c0....c4\|a16..a21,b16..b20,c16..c20\|
				242	// \|c5...c10,a5....a9,b5....b9\|c21..c26,a22..a26,b21..b25\|
				243	// \|b10..b15,c11..c15,a10..a15\|b26..b31,c27..c31,a27..a31\|
				244	// For the sequence to work as a mirror to the load.
				245	// We must consider the elements order as above.
				246	// In this function we are combining two types of shuffles.
				247	// The first one is vpshufed and the second is a type of "blend" shuffle.
				248	// By computing the shuffle on a sequence of 16 elements(one lane) and add the
				249	// correct offset. We are creating a vpsuffed + blend sequence between two
				250	// shuffles.
				251	static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
				252	SmallVectorImpl<uint32_t> &Out, int LowOffset,
				253	int HighOffset) {
				254	assert(VT.getSizeInBits() >= 256 &&
				255	"This function doesn't accept width smaller then 256");
				256	unsigned NumOfElm = VT.getVectorNumElements();
				257	for (unsigned i = 0; i < Mask.size(); i++)
				258	Out.push_back(Mask[i] + LowOffset);
				259	for (unsigned i = 0; i < Mask.size(); i++)
				260	Out.push_back(Mask[i] + HighOffset + NumOfElm);
				261	}
				262
Craig Topper	2153114	2017-11-14 16:14:00 +0000	[diff] [blame]	263	// reorderSubVector returns the data to is the original state. And de-facto is
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	264	// the opposite of the function concatSubVector.
				265
				266	// For VecElems = 16
				267	// Invec[0] - \|0\| TransposedMatrix[0] - \|0\|
				268	// Invec[1] - \|1\| => TransposedMatrix[1] - \|1\|
				269	// Invec[2] - \|2\| TransposedMatrix[2] - \|2\|
				270
				271	// For VecElems = 32
				272	// Invec[0] - \|0\|3\| TransposedMatrix[0] - \|0\|1\|
				273	// Invec[1] - \|1\|4\| => TransposedMatrix[1] - \|2\|3\|
				274	// Invec[2] - \|2\|5\| TransposedMatrix[2] - \|4\|5\|
				275
				276	// For VecElems = 64
				277	// Invec[0] - \|0\|3\|6\|9 \| TransposedMatrix[0] - \|0\|1\|2 \|3 \|
				278	// Invec[1] - \|1\|4\|7\|10\| => TransposedMatrix[1] - \|4\|5\|6 \|7 \|
				279	// Invec[2] - \|2\|5\|8\|11\| TransposedMatrix[2] - \|8\|9\|10\|11\|
				280
				281	static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
				282	ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
				283	unsigned VecElems, unsigned Stride,
				284	IRBuilder<> Builder) {
				285
				286	if (VecElems == 16) {
				287	for (unsigned i = 0; i < Stride; i++)
				288	TransposedMatrix[i] = Builder.CreateShuffleVector(
				289	Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
				290	return;
				291	}
				292
				293	SmallVector<uint32_t, 32> OptimizeShuf;
				294	Value *Temp[8];
				295
				296	for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
				297	genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
				298	(i + 1) / Stride * 16);
				299	Temp[i / 2] = Builder.CreateShuffleVector(
				300	Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
				301	OptimizeShuf.clear();
				302	}
				303
				304	if (VecElems == 32) {
				305	std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
				306	return;
				307	}
				308	else
				309	for (unsigned i = 0; i < Stride; i++)
				310	TransposedMatrix[i] =
				311	Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	312	}
				313
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	314	void X86InterleavedAccessGroup::interleave8bitStride4VF8(
				315	ArrayRef<Instruction *> Matrix,
				316	SmallVectorImpl<Value *> &TransposedMatrix) {
				317	// Assuming we start from the following vectors:
				318	// Matrix[0]= c0 c1 c2 c3 c4 ... c7
				319	// Matrix[1]= m0 m1 m2 m3 m4 ... m7
				320	// Matrix[2]= y0 y1 y2 y3 y4 ... y7
				321	// Matrix[3]= k0 k1 k2 k3 k4 ... k7
				322
				323	MVT VT = MVT::v8i16;
				324	TransposedMatrix.resize(2);
				325	SmallVector<uint32_t, 16> MaskLow;
				326	SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
				327	SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
				328
				329	for (unsigned i = 0; i < 8; ++i) {
				330	MaskLow.push_back(i);
				331	MaskLow.push_back(i + 8);
				332	}
				333
				334	createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
				335	createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
				336	scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
				337	scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
				338	// IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
				339	// IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
				340	Value *IntrVec1Low =
				341	Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				342	Value *IntrVec2Low =
				343	Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				344
				345	// TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
				346	// TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
				347
				348	TransposedMatrix[0] =
				349	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
				350	TransposedMatrix[1] =
				351	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
				352	}
				353
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	354	void X86InterleavedAccessGroup::interleave8bitStride4(
				355	ArrayRef<Instruction > Matrix, SmallVectorImpl<Value > &TransposedMatrix,
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	356	unsigned NumOfElm) {
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	357	// Example: Assuming we start from the following vectors:
				358	// Matrix[0]= c0 c1 c2 c3 c4 ... c31
				359	// Matrix[1]= m0 m1 m2 m3 m4 ... m31
				360	// Matrix[2]= y0 y1 y2 y3 y4 ... y31
				361	// Matrix[3]= k0 k1 k2 k3 k4 ... k31
				362
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	363	MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	364	MVT HalfVT = scaleVectorType(VT);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	365
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	366	TransposedMatrix.resize(4);
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	367	SmallVector<uint32_t, 32> MaskHigh;
				368	SmallVector<uint32_t, 32> MaskLow;
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	369	SmallVector<uint32_t, 32> LowHighMask[2];
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	370	SmallVector<uint32_t, 32> MaskHighTemp;
				371	SmallVector<uint32_t, 32> MaskLowTemp;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	372
				373	// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
				374	// shuffle pattern.
				375
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	376	createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	377	createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	378
				379	// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
				380	// shuffle pattern.
				381
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	382	createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
				383	createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
				384	scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
				385	scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	386
				387	// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 \| c16 m16 c17 m17 ... c23 m23
				388	// IntrVec1High = c8 m8 c9 m9 ... c15 m15 \| c24 m24 c25 m25 ... c31 m31
				389	// IntrVec2Low = y0 k0 y1 k1 ... y7 k7 \| y16 k16 y17 k17 ... y23 k23
				390	// IntrVec2High = y8 k8 y9 k9 ... y15 k15 \| y24 k24 y25 k25 ... y31 k31
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	391	Value *IntrVec[4];
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	392
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	393	IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				394	IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
				395	IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				396	IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	397
				398	// cmyk4 cmyk5 cmyk6 cmyk7 \| cmyk20 cmyk21 cmyk22 cmyk23
				399	// cmyk12 cmyk13 cmyk14 cmyk15 \| cmyk28 cmyk29 cmyk30 cmyk31
				400	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk16 cmyk17 cmyk18 cmyk19
				401	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk24 cmyk25 cmyk26 cmyk27
				402
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	403	Value *VecOut[4];
				404	for (int i = 0; i < 4; i++)
				405	VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
				406	LowHighMask[i % 2]);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	407
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	408	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk4 cmyk5 cmyk6 cmyk7
				409	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk12 cmyk13 cmyk14 cmyk15
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	410	// cmyk16 cmyk17 cmyk18 cmyk19 \| cmyk20 cmyk21 cmyk22 cmyk23
				411	// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31
				412
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	413	if (VT == MVT::v16i8) {
				414	std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
				415	return;
				416	}
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	417
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	418	reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
				419	NumOfElm, 4, Builder);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	420	}
				421
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	422	// createShuffleStride returns shuffle mask of size N.
				423	// The shuffle pattern is as following :
				424	// {0, Stride%(VF/Lane), (2Stride%(VF/Lane))...(VFStride/Lane)%(VF/Lane),
				425	// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
				426	// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
				427	// Where Lane is the # of lanes in a register:
				428	// VectorSize = 128 => Lane = 1
				429	// VectorSize = 256 => Lane = 2
				430	// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
				431	// {<[0\|3\|6\|1\|4\|7\|2\|5]-[8\|11\|14\|9\|12\|15\|10\|13]>}
				432	static void createShuffleStride(MVT VT, int Stride,
				433	SmallVectorImpl<uint32_t> &Mask) {
				434	int VectorSize = VT.getSizeInBits();
				435	int VF = VT.getVectorNumElements();
				436	int LaneCount = std::max(VectorSize / 128, 1);
				437	for (int Lane = 0; Lane < LaneCount; Lane++)
				438	for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
				439	Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
				440	}
				441
				442	// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
				443	// inside mask a shuffleMask. A mask contains exactly 3 groups, where
				444	// each group is a monotonically increasing sequence with stride 3.
				445	// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
				446	static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
				447	int VectorSize = VT.getSizeInBits();
				448	int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
				449	for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
				450	int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
				451	SizeInfo.push_back(GroupSize);
				452	FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
				453	}
				454	}
				455
				456	// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
				457	// vpalign works according to lanes
				458	// Where Lane is the # of lanes in a register:
				459	// VectorWide = 128 => Lane = 1
				460	// VectorWide = 256 => Lane = 2
				461	// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
				462	// For Lane = 2 shuffle pattern is:
				463	// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
				464	// Imm variable sets the offset amount. The result of the
				465	// function is stored inside ShuffleMask vector and it built as described in
				466	// the begin of the description. AlignDirection is a boolean that indecat the
				467	// direction of the alignment. (false - align to the "right" side while true -
				468	// align to the "left" side)
				469	static void DecodePALIGNRMask(MVT VT, unsigned Imm,
				470	SmallVectorImpl<uint32_t> &ShuffleMask,
				471	bool AlignDirection = true, bool Unary = false) {
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	472	unsigned NumElts = VT.getVectorNumElements();
				473	unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
				474	unsigned NumLaneElts = NumElts / NumLanes;
				475
				476	Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
				477	unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
				478
				479	for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
				480	for (unsigned i = 0; i != NumLaneElts; ++i) {
				481	unsigned Base = i + Offset;
				482	// if i+offset is out of this lane then we actually need the other source
				483	// If Unary the other source is the first source.
				484	if (Base >= NumLaneElts)
				485	Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
				486	ShuffleMask.push_back(Base + l);
				487	}
				488	}
				489	}
				490
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	491	// concatSubVector - The function rebuilds the data to a correct expected
				492	// order. An assumption(The shape of the matrix) was taken for the
				493	// deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
				494	// This function ensures that the data is built in correct way for the lane
				495	// instructions. Each lane inside the vector is a 128-bit length.
				496	//
				497	// The 'InVec' argument contains the data in increasing order. In InVec[0] You
				498	// can find the first 128 bit data. The number of different lanes inside a
				499	// vector depends on the 'VecElems'.In general, the formula is
				500	// VecElems * type / 128. The size of the array 'InVec' depends and equal to
				501	// 'VecElems'.
				502
				503	// For VecElems = 16
				504	// Invec[0] - \|0\| Vec[0] - \|0\|
				505	// Invec[1] - \|1\| => Vec[1] - \|1\|
				506	// Invec[2] - \|2\| Vec[2] - \|2\|
				507
				508	// For VecElems = 32
				509	// Invec[0] - \|0\|1\| Vec[0] - \|0\|3\|
				510	// Invec[1] - \|2\|3\| => Vec[1] - \|1\|4\|
				511	// Invec[2] - \|4\|5\| Vec[2] - \|2\|5\|
				512
				513	// For VecElems = 64
				514	// Invec[0] - \|0\|1\|2 \|3 \| Vec[0] - \|0\|3\|6\|9 \|
				515	// Invec[1] - \|4\|5\|6 \|7 \| => Vec[1] - \|1\|4\|7\|10\|
				516	// Invec[2] - \|8\|9\|10\|11\| Vec[2] - \|2\|5\|8\|11\|
				517
				518	static void concatSubVector(Value *Vec, ArrayRef<Instruction > InVec,
				519	unsigned VecElems, IRBuilder<> Builder) {
				520	if (VecElems == 16) {
				521	for (int i = 0; i < 3; i++)
				522	Vec[i] = InVec[i];
				523	return;
				524	}
				525
				526	for (unsigned j = 0; j < VecElems / 32; j++)
				527	for (int i = 0; i < 3; i++)
				528	Vec[i + j * 3] = Builder.CreateShuffleVector(
				529	InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));
				530
				531	if (VecElems == 32)
				532	return;
				533
				534	for (int i = 0; i < 3; i++)
				535	Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	536	}
				537
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	538	void X86InterleavedAccessGroup::deinterleave8bitStride3(
				539	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				540	unsigned VecElems) {
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	541	// Example: Assuming we start from the following vectors:
				542	// Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
				543	// Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
				544	// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
				545
				546	TransposedMatrix.resize(3);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	547	SmallVector<uint32_t, 32> VPShuf;
				548	SmallVector<uint32_t, 32> VPAlign[2];
				549	SmallVector<uint32_t, 32> VPAlign2;
				550	SmallVector<uint32_t, 32> VPAlign3;
				551	SmallVector<uint32_t, 3> GroupSize;
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	552	Value Vec[6], TempVector[3];
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	553
				554	MVT VT = MVT::getVT(Shuffles[0]->getType());
				555
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	556	createShuffleStride(VT, 3, VPShuf);
				557	setGroupSize(VT, GroupSize);
				558
				559	for (int i = 0; i < 2; i++)
				560	DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
				561
				562	DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
				563	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
				564
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	565	concatSubVector(Vec, InVec, VecElems, Builder);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	566	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				567	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				568	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				569
				570	for (int i = 0; i < 3; i++)
				571	Vec[i] = Builder.CreateShuffleVector(
				572	Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
				573
				574	// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
				575	// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
				576	// TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
				577
				578	for (int i = 0; i < 3; i++)
				579	TempVector[i] =
				580	Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
				581
				582	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				583	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				584	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				585
				586	for (int i = 0; i < 3; i++)
				587	Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
				588	VPAlign[1]);
				589
				590	// TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				591	// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				592	// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
				593
				594	Value *TempVec = Builder.CreateShuffleVector(
				595	Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
				596	TransposedMatrix[0] = Builder.CreateShuffleVector(
				597	Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
				598	TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
				599	TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	600	}
				601
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	602	// group2Shuffle reorder the shuffle stride back into continuous order.
				603	// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
				604	// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
				605	static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
				606	SmallVectorImpl<uint32_t> &Output) {
				607	int IndexGroup[3] = {0, 0, 0};
				608	int Index = 0;
				609	int VectorWidth = VT.getSizeInBits();
				610	int VF = VT.getVectorNumElements();
				611	// Find the index of the different groups.
				612	int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
				613	for (int i = 0; i < 3; i++) {
				614	IndexGroup[(Index * 3) % (VF / Lane)] = Index;
				615	Index += Mask[i];
				616	}
				617	// According to the index compute the convert mask.
				618	for (int i = 0; i < VF / Lane; i++) {
				619	Output.push_back(IndexGroup[i % 3]);
				620	IndexGroup[i % 3]++;
				621	}
				622	}
				623
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	624	void X86InterleavedAccessGroup::interleave8bitStride3(
				625	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				626	unsigned VecElems) {
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	627	// Example: Assuming we start from the following vectors:
				628	// Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				629	// Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				630	// Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
				631
				632	TransposedMatrix.resize(3);
				633	SmallVector<uint32_t, 3> GroupSize;
				634	SmallVector<uint32_t, 32> VPShuf;
				635	SmallVector<uint32_t, 32> VPAlign[3];
				636	SmallVector<uint32_t, 32> VPAlign2;
				637	SmallVector<uint32_t, 32> VPAlign3;
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	638
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	639	Value Vec[3], TempVector[3];
				640	MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
				641
				642	setGroupSize(VT, GroupSize);
				643
				644	for (int i = 0; i < 3; i++)
				645	DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
				646
				647	DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
				648	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
				649
				650	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				651	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				652	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				653
				654	Vec[0] = Builder.CreateShuffleVector(
				655	InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
				656	Vec[1] = Builder.CreateShuffleVector(
				657	InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
				658	Vec[2] = InVec[2];
				659
				660	// Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
				661	// Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
				662	// Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
				663
				664	for (int i = 0; i < 3; i++)
				665	TempVector[i] =
				666	Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
				667
				668	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				669	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				670	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				671
				672	for (int i = 0; i < 3; i++)
				673	Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
				674	VPAlign[2]);
				675
				676	// TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
				677	// TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
				678	// TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
				679
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	680	unsigned NumOfElm = VT.getVectorNumElements();
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	681	group2Shuffle(VT, GroupSize, VPShuf);
				682	reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	683	}
				684
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	685	void X86InterleavedAccessGroup::transpose_4x4(
				686	ArrayRef<Instruction *> Matrix,
				687	SmallVectorImpl<Value *> &TransposedMatrix) {
				688	assert(Matrix.size() == 4 && "Invalid matrix size");
				689	TransposedMatrix.resize(4);
				690
				691	// dst = src1[0,1],src2[0,1]
				692	uint32_t IntMask1[] = {0, 1, 4, 5};
				693	ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
				694	Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				695	Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				696
				697	// dst = src1[2,3],src2[2,3]
				698	uint32_t IntMask2[] = {2, 3, 6, 7};
				699	Mask = makeArrayRef(IntMask2, 4);
				700	Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				701	Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				702
				703	// dst = src1[0],src2[0],src1[2],src2[2]
				704	uint32_t IntMask3[] = {0, 4, 2, 6};
				705	Mask = makeArrayRef(IntMask3, 4);
				706	TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				707	TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				708
				709	// dst = src1[1],src2[1],src1[3],src2[3]
				710	uint32_t IntMask4[] = {1, 5, 3, 7};
				711	Mask = makeArrayRef(IntMask4, 4);
				712	TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				713	TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				714	}
				715
				716	// Lowers this interleaved access group into X86-specific
				717	// instructions/intrinsics.
				718	bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
				719	SmallVector<Instruction *, 4> DecomposedVectors;
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	720	SmallVector<Value *, 4> TransposedVectors;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	721	VectorType *ShuffleTy = Shuffles[0]->getType();
				722
				723	if (isa<LoadInst>(Inst)) {
				724	// Try to generate target-sized register(/instruction).
				725	decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
				726
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	727	Type *ShuffleEltTy = Inst->getType();
				728	unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	729	// Perform matrix-transposition in order to compute interleaved
				730	// results by generating some sort of (optimized) target-specific
				731	// instructions.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	732
				733	switch (NumSubVecElems) {
				734	default:
				735	return false;
				736	case 4:
				737	transpose_4x4(DecomposedVectors, TransposedVectors);
				738	break;
				739	case 8:
				740	case 16:
				741	case 32:
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	742	case 64:
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	743	deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
				744	NumSubVecElems);
				745	break;
				746	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	747
				748	// Now replace the unoptimized-interleaved-vectors with the
				749	// transposed-interleaved vectors.
				750	for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
				751	Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
				752
				753	return true;
				754	}
				755
				756	Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
				757	unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
				758
				759	// Lower the interleaved stores:
				760	// 1. Decompose the interleaved wide shuffle into individual shuffle
				761	// vectors.
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	762	decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
				763	DecomposedVectors);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	764
				765	// 2. Transpose the interleaved-vectors into vectors of contiguous
				766	// elements.
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	767	switch (NumSubVecElems) {
				768	case 4:
				769	transpose_4x4(DecomposedVectors, TransposedVectors);
				770	break;
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	771	case 8:
				772	interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
				773	break;
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	774	case 16:
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	775	case 32:
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	776	case 64:
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	777	if (Factor == 4)
				778	interleave8bitStride4(DecomposedVectors, TransposedVectors,
				779	NumSubVecElems);
				780	if (Factor == 3)
				781	interleave8bitStride3(DecomposedVectors, TransposedVectors,
				782	NumSubVecElems);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	783	break;
				784	default:
				785	return false;
				786	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	787
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	788	// 3. Concatenate the contiguous-vectors back into a wide vector.
				789	Value *WideVec = concatenateVectors(Builder, TransposedVectors);
				790
				791	// 4. Generate a store instruction for wide-vec.
				792	StoreInst *SI = cast<StoreInst>(Inst);
				793	Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
				794	SI->getAlignment());
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	795
				796	return true;
				797	}
				798
				799	// Lower interleaved load(s) into target specific instructions/
				800	// intrinsics. Lowering sequence varies depending on the vector-types, factor,
				801	// number of shuffles and ISA.
				802	// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	803	bool X86TargetLowering::lowerInterleavedLoad(
				804	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
				805	ArrayRef<unsigned> Indices, unsigned Factor) const {
				806	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				807	"Invalid interleave factor");
				808	assert(!Shuffles.empty() && "Empty shufflevector input");
				809	assert(Shuffles.size() == Indices.size() &&
				810	"Unmatched number of shufflevectors and indices");
				811
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	812	// Create an interleaved access group.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	813	IRBuilder<> Builder(LI);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	814	X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
				815	Builder);
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	816
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	817	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	818	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	819
				820	bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
				821	ShuffleVectorInst *SVI,
				822	unsigned Factor) const {
				823	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				824	"Invalid interleave factor");
				825
Farhana Aleen	9bd593e	2017-06-22 23:56:31 +0000	[diff] [blame]	826	assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	827	"Invalid interleaved store");
				828
				829	// Holds the indices of SVI that correspond to the starting index of each
				830	// interleaved shuffle.
				831	SmallVector<unsigned, 4> Indices;
				832	auto Mask = SVI->getShuffleMask();
				833	for (unsigned i = 0; i < Factor; i++)
				834	Indices.push_back(Mask[i]);
				835
				836	ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
				837
				838	// Create an interleaved access group.
				839	IRBuilder<> Builder(SI);
				840	X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
				841	Builder);
				842
				843	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
				844	}