Blame - llvm/lib/Target/X86/X86InterleavedAccess.cpp - toolchain/llvm-project

blob: a95eeffd94dd94a733d924a2b50a0531adaf33e8 [file] [log] [blame]

Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	1	//===- X86InterleavedAccess.cpp -------------------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	2	//
Chandler Carruth	2946cd7	2019-01-19 08:50:56 +0000	[diff] [blame]	3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				4	// See https://llvm.org/LICENSE.txt for license information.
				5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	6	//
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	7	//===----------------------------------------------------------------------===//
				8	//
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	9	/// \file
				10	/// This file contains the X86 implementation of the interleaved accesses
				11	/// optimization generating X86-specific instructions/intrinsics for
				12	/// interleaved access groups.
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	13	//
				14	//===----------------------------------------------------------------------===//
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	15
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	16	#include "X86ISelLowering.h"
				17	#include "X86Subtarget.h"
				18	#include "llvm/ADT/ArrayRef.h"
				19	#include "llvm/ADT/SmallVector.h"
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	20	#include "llvm/Analysis/VectorUtils.h"
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	21	#include "llvm/IR/Constants.h"
				22	#include "llvm/IR/DataLayout.h"
				23	#include "llvm/IR/DerivedTypes.h"
				24	#include "llvm/IR/IRBuilder.h"
				25	#include "llvm/IR/Instruction.h"
				26	#include "llvm/IR/Instructions.h"
				27	#include "llvm/IR/Module.h"
				28	#include "llvm/IR/Type.h"
				29	#include "llvm/IR/Value.h"
				30	#include "llvm/Support/Casting.h"
David Blaikie	13e77db	2018-03-23 23:58:25 +0000	[diff] [blame]	31	#include "llvm/Support/MachineValueType.h"
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	32	#include <algorithm>
				33	#include <cassert>
				34	#include <cmath>
				35	#include <cstdint>
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	36
				37	using namespace llvm;
				38
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	39	namespace {
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	40
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	41	/// This class holds necessary information to represent an interleaved
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	42	/// access group and supports utilities to lower the group into
				43	/// X86-specific instructions/intrinsics.
				44	/// E.g. A group of interleaving access loads (Factor = 2; accessing every
				45	/// other element)
				46	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
				47	/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
				48	/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	49	class X86InterleavedAccessGroup {
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	50	/// Reference to the wide-load instruction of an interleaved access
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	51	/// group.
				52	Instruction *const Inst;
				53
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	54	/// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	55	ArrayRef<ShuffleVectorInst *> Shuffles;
				56
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	57	/// Reference to the starting index of each user-shuffle.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	58	ArrayRef<unsigned> Indices;
				59
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	60	/// Reference to the interleaving stride in terms of elements.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	61	const unsigned Factor;
				62
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	63	/// Reference to the underlying target.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	64	const X86Subtarget &Subtarget;
				65
				66	const DataLayout &DL;
				67
				68	IRBuilder<> &Builder;
				69
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	70	/// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	71	/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
				72	void decompose(Instruction Inst, unsigned NumSubVectors, VectorType T,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	73	SmallVectorImpl<Instruction *> &DecomposedVectors);
				74
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	75	/// Performs matrix transposition on a 4x4 matrix \p InputVectors and
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	76	/// returns the transposed-vectors in \p TransposedVectors.
				77	/// E.g.
				78	/// InputVectors:
				79	/// In-V0 = p1, p2, p3, p4
				80	/// In-V1 = q1, q2, q3, q4
				81	/// In-V2 = r1, r2, r3, r4
				82	/// In-V3 = s1, s2, s3, s4
				83	/// OutputVectors:
				84	/// Out-V0 = p1, q1, r1, s1
				85	/// Out-V1 = p2, q2, r2, s2
				86	/// Out-V2 = p3, q3, r3, s3
				87	/// Out-V3 = P4, q4, r4, s4
				88	void transpose_4x4(ArrayRef<Instruction *> InputVectors,
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	89	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	90	void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
				91	SmallVectorImpl<Value *> &TransposedMatrix,
				92	unsigned NumSubVecElems);
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	93	void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
				94	SmallVectorImpl<Value *> &TransposedMatrix);
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	95	void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				96	SmallVectorImpl<Value *> &TransposedMatrix,
				97	unsigned NumSubVecElems);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	98	void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
				99	SmallVectorImpl<Value *> &TransposedMatrix,
				100	unsigned NumSubVecElems);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	101
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	102	public:
				103	/// In order to form an interleaved access group X86InterleavedAccessGroup
				104	/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
				105	/// \p Shuffs, reference to the first indices of each interleaved-vector
				106	/// \p 'Ind' and the interleaving stride factor \p F. In order to generate
				107	/// X86-specific instructions/intrinsics it also requires the underlying
				108	/// target information \p STarget.
				109	explicit X86InterleavedAccessGroup(Instruction *I,
				110	ArrayRef<ShuffleVectorInst *> Shuffs,
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	111	ArrayRef<unsigned> Ind, const unsigned F,
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	112	const X86Subtarget &STarget,
				113	IRBuilder<> &B)
				114	: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
				115	DL(Inst->getModule()->getDataLayout()), Builder(B) {}
				116
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	117	/// Returns true if this interleaved access group can be lowered into
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	118	/// x86-specific instructions/intrinsics, false otherwise.
				119	bool isSupported() const;
				120
Adrian Prantl	5f8f34e4	2018-05-01 15:54:18 +0000	[diff] [blame]	121	/// Lowers this interleaved access group into X86-specific
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	122	/// instructions/intrinsics.
				123	bool lowerIntoOptimizedSequence();
				124	};
Eugene Zelenko	60433b6	2017-10-05 00:33:50 +0000	[diff] [blame]	125
Benjamin Kramer	efcf06f	2017-02-11 11:06:55 +0000	[diff] [blame]	126	} // end anonymous namespace
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	127
				128	bool X86InterleavedAccessGroup::isSupported() const {
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	129	VectorType *ShuffleVecTy = Shuffles[0]->getType();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	130	Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	131	unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	132	unsigned WideInstSize;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	133
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	134	// Currently, lowering is supported for the following vectors:
				135	// Stride 4:
				136	// 1. Store and load of 4-element vectors of 64 bits on AVX.
				137	// 2. Store of 16/32-element vectors of 8 bits on AVX.
				138	// Stride 3:
Craig Topper	2153114	2017-11-14 16:14:00 +0000	[diff] [blame]	139	// 1. Load of 16/32-element vectors of 8 bits on AVX.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	140	if (!Subtarget.hasAVX() \|\| (Factor != 4 && Factor != 3))
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	141	return false;
				142
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	143	if (isa<LoadInst>(Inst)) {
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	144	WideInstSize = DL.getTypeSizeInBits(Inst->getType());
Michael Zuckerman	72a6f89	2017-10-18 08:04:31 +0000	[diff] [blame]	145	if (cast<LoadInst>(Inst)->getPointerAddressSpace())
				146	return false;
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	147	} else
				148	WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
				149
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	150	// We support shuffle represents stride 4 for byte type with size of
				151	// WideInstSize.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	152	if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
				153	return true;
				154
				155	if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	156	(WideInstSize == 256 \|\| WideInstSize == 512 \|\| WideInstSize == 1024 \|\|
				157	WideInstSize == 2048))
				158	return true;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	159
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	160	if (ShuffleElemSize == 8 && Factor == 3 &&
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	161	(WideInstSize == 384 \|\| WideInstSize == 768 \|\| WideInstSize == 1536))
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	162	return true;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	163
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	164	return false;
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	165	}
				166
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	167	void X86InterleavedAccessGroup::decompose(
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	168	Instruction VecInst, unsigned NumSubVectors, VectorType SubVecTy,
				169	SmallVectorImpl<Instruction *> &DecomposedVectors) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	170	assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&
				171	"Expected Load or Shuffle");
				172
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	173	Type *VecWidth = VecInst->getType();
				174	(void)VecWidth;
				175	assert(VecWidth->isVectorTy() &&
				176	DL.getTypeSizeInBits(VecWidth) >=
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	177	DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
				178	"Invalid Inst-size!!!");
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	179
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	180	if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
				181	Value *Op0 = SVI->getOperand(0);
				182	Value *Op1 = SVI->getOperand(1);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	183
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	184	// Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
				185	for (unsigned i = 0; i < NumSubVectors; ++i)
				186	DecomposedVectors.push_back(
				187	cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	188	Op0, Op1,
				189	createSequentialMask(Builder, Indices[i],
				190	SubVecTy->getVectorNumElements(), 0))));
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	191	return;
				192	}
				193
				194	// Decompose the load instruction.
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	195	LoadInst *LI = cast<LoadInst>(VecInst);
James Y Knight	14359ef	2019-02-01 20:44:24 +0000	[diff] [blame]	196	Type VecBaseTy, VecBasePtrTy;
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	197	Value *VecBasePtr;
				198	unsigned int NumLoads = NumSubVectors;
				199	// In the case of stride 3 with a vector of 32 elements load the information
				200	// in the following way:
				201	// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	202	unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
				203	if (VecLength == 768 \|\| VecLength == 1536) {
James Y Knight	14359ef	2019-02-01 20:44:24 +0000	[diff] [blame]	204	VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
				205	VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	206	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
James Y Knight	14359ef	2019-02-01 20:44:24 +0000	[diff] [blame]	207	NumLoads = NumSubVectors * (VecLength / 384);
				208	} else {
				209	VecBaseTy = SubVecTy;
				210	VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
				211	VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
				212	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	213	// Generate N loads of T type.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	214	for (unsigned i = 0; i < NumLoads; i++) {
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	215	// TODO: Support inbounds GEP.
James Y Knight	7716075	2019-02-01 20:44:47 +0000	[diff] [blame]	216	Value *NewBasePtr =
				217	Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	218	Instruction *NewLoad =
James Y Knight	14359ef	2019-02-01 20:44:24 +0000	[diff] [blame]	219	Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	220	DecomposedVectors.push_back(NewLoad);
				221	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	222	}
				223
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	224	// Changing the scale of the vector type by reducing the number of elements and
				225	// doubling the scalar size.
				226	static MVT scaleVectorType(MVT VT) {
				227	unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
				228	return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
				229	VT.getVectorNumElements() / 2);
				230	}
				231
// Identity index table 0..63. Prefixes of it (16, 32 or all 64 entries) are
// used as shuffle masks that select elements in order. The table is never
// modified, so it is a compile-time constant.
static constexpr uint32_t Concat[] = {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
				237
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	238	// genShuffleBland - Creates shuffle according to two vectors.This function is
				239	// only works on instructions with lane inside 256 registers. According to
				240	// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
				241	// offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
				242	// Where the 'LowOffset' refers to the first vector and the highOffset refers to
				243	// the second vector.
				244	// \|a0....a5,b0....b4,c0....c4\|a16..a21,b16..b20,c16..c20\|
				245	// \|c5...c10,a5....a9,b5....b9\|c21..c26,a22..a26,b21..b25\|
				246	// \|b10..b15,c11..c15,a10..a15\|b26..b31,c27..c31,a27..a31\|
				247	// For the sequence to work as a mirror to the load.
				248	// We must consider the elements order as above.
				249	// In this function we are combining two types of shuffles.
				250	// The first one is vpshufed and the second is a type of "blend" shuffle.
				251	// By computing the shuffle on a sequence of 16 elements(one lane) and add the
				252	// correct offset. We are creating a vpsuffed + blend sequence between two
				253	// shuffles.
				254	static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
				255	SmallVectorImpl<uint32_t> &Out, int LowOffset,
				256	int HighOffset) {
				257	assert(VT.getSizeInBits() >= 256 &&
				258	"This function doesn't accept width smaller then 256");
				259	unsigned NumOfElm = VT.getVectorNumElements();
				260	for (unsigned i = 0; i < Mask.size(); i++)
				261	Out.push_back(Mask[i] + LowOffset);
				262	for (unsigned i = 0; i < Mask.size(); i++)
				263	Out.push_back(Mask[i] + HighOffset + NumOfElm);
				264	}
				265
Craig Topper	2153114	2017-11-14 16:14:00 +0000	[diff] [blame]	266	// reorderSubVector returns the data to is the original state. And de-facto is
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	267	// the opposite of the function concatSubVector.
				268
				269	// For VecElems = 16
				270	// Invec[0] - \|0\| TransposedMatrix[0] - \|0\|
				271	// Invec[1] - \|1\| => TransposedMatrix[1] - \|1\|
				272	// Invec[2] - \|2\| TransposedMatrix[2] - \|2\|
				273
				274	// For VecElems = 32
				275	// Invec[0] - \|0\|3\| TransposedMatrix[0] - \|0\|1\|
				276	// Invec[1] - \|1\|4\| => TransposedMatrix[1] - \|2\|3\|
				277	// Invec[2] - \|2\|5\| TransposedMatrix[2] - \|4\|5\|
				278
				279	// For VecElems = 64
				280	// Invec[0] - \|0\|3\|6\|9 \| TransposedMatrix[0] - \|0\|1\|2 \|3 \|
				281	// Invec[1] - \|1\|4\|7\|10\| => TransposedMatrix[1] - \|4\|5\|6 \|7 \|
				282	// Invec[2] - \|2\|5\|8\|11\| TransposedMatrix[2] - \|8\|9\|10\|11\|
				283
				284	static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
				285	ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
				286	unsigned VecElems, unsigned Stride,
				287	IRBuilder<> Builder) {
				288
				289	if (VecElems == 16) {
				290	for (unsigned i = 0; i < Stride; i++)
				291	TransposedMatrix[i] = Builder.CreateShuffleVector(
				292	Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
				293	return;
				294	}
				295
				296	SmallVector<uint32_t, 32> OptimizeShuf;
				297	Value *Temp[8];
				298
				299	for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
				300	genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
				301	(i + 1) / Stride * 16);
				302	Temp[i / 2] = Builder.CreateShuffleVector(
				303	Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
				304	OptimizeShuf.clear();
				305	}
				306
				307	if (VecElems == 32) {
				308	std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
				309	return;
				310	}
				311	else
				312	for (unsigned i = 0; i < Stride; i++)
				313	TransposedMatrix[i] =
				314	Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	315	}
				316
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	317	void X86InterleavedAccessGroup::interleave8bitStride4VF8(
				318	ArrayRef<Instruction *> Matrix,
				319	SmallVectorImpl<Value *> &TransposedMatrix) {
				320	// Assuming we start from the following vectors:
				321	// Matrix[0]= c0 c1 c2 c3 c4 ... c7
				322	// Matrix[1]= m0 m1 m2 m3 m4 ... m7
				323	// Matrix[2]= y0 y1 y2 y3 y4 ... y7
				324	// Matrix[3]= k0 k1 k2 k3 k4 ... k7
				325
				326	MVT VT = MVT::v8i16;
				327	TransposedMatrix.resize(2);
				328	SmallVector<uint32_t, 16> MaskLow;
				329	SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
				330	SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
				331
				332	for (unsigned i = 0; i < 8; ++i) {
				333	MaskLow.push_back(i);
				334	MaskLow.push_back(i + 8);
				335	}
				336
				337	createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
				338	createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
				339	scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
				340	scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
				341	// IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
				342	// IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
				343	Value *IntrVec1Low =
				344	Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				345	Value *IntrVec2Low =
				346	Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				347
				348	// TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
				349	// TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
				350
				351	TransposedMatrix[0] =
				352	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
				353	TransposedMatrix[1] =
				354	Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
				355	}
				356
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	357	void X86InterleavedAccessGroup::interleave8bitStride4(
				358	ArrayRef<Instruction > Matrix, SmallVectorImpl<Value > &TransposedMatrix,
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	359	unsigned NumOfElm) {
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	360	// Example: Assuming we start from the following vectors:
				361	// Matrix[0]= c0 c1 c2 c3 c4 ... c31
				362	// Matrix[1]= m0 m1 m2 m3 m4 ... m31
				363	// Matrix[2]= y0 y1 y2 y3 y4 ... y31
				364	// Matrix[3]= k0 k1 k2 k3 k4 ... k31
				365
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	366	MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	367	MVT HalfVT = scaleVectorType(VT);
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	368
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	369	TransposedMatrix.resize(4);
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	370	SmallVector<uint32_t, 32> MaskHigh;
				371	SmallVector<uint32_t, 32> MaskLow;
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	372	SmallVector<uint32_t, 32> LowHighMask[2];
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	373	SmallVector<uint32_t, 32> MaskHighTemp;
				374	SmallVector<uint32_t, 32> MaskLowTemp;
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	375
				376	// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
				377	// shuffle pattern.
				378
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	379	createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	380	createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	381
				382	// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
				383	// shuffle pattern.
				384
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	385	createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
				386	createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
				387	scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
				388	scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	389
				390	// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 \| c16 m16 c17 m17 ... c23 m23
				391	// IntrVec1High = c8 m8 c9 m9 ... c15 m15 \| c24 m24 c25 m25 ... c31 m31
				392	// IntrVec2Low = y0 k0 y1 k1 ... y7 k7 \| y16 k16 y17 k17 ... y23 k23
				393	// IntrVec2High = y8 k8 y9 k9 ... y15 k15 \| y24 k24 y25 k25 ... y31 k31
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	394	Value *IntrVec[4];
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	395
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	396	IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
				397	IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
				398	IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
				399	IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	400
				401	// cmyk4 cmyk5 cmyk6 cmyk7 \| cmyk20 cmyk21 cmyk22 cmyk23
				402	// cmyk12 cmyk13 cmyk14 cmyk15 \| cmyk28 cmyk29 cmyk30 cmyk31
				403	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk16 cmyk17 cmyk18 cmyk19
				404	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk24 cmyk25 cmyk26 cmyk27
				405
Michael Zuckerman	b92b6d4	2017-09-30 14:55:03 +0000	[diff] [blame]	406	Value *VecOut[4];
				407	for (int i = 0; i < 4; i++)
				408	VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
				409	LowHighMask[i % 2]);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	410
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	411	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk4 cmyk5 cmyk6 cmyk7
				412	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk12 cmyk13 cmyk14 cmyk15
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	413	// cmyk16 cmyk17 cmyk18 cmyk19 \| cmyk20 cmyk21 cmyk22 cmyk23
				414	// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31
				415
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	416	if (VT == MVT::v16i8) {
				417	std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
				418	return;
				419	}
Michael Zuckerman	80d3649f	2017-09-13 18:28:09 +0000	[diff] [blame]	420
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	421	reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
				422	NumOfElm, 4, Builder);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	423	}
				424
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	425	// createShuffleStride returns shuffle mask of size N.
				426	// The shuffle pattern is as following :
				427	// {0, Stride%(VF/Lane), (2Stride%(VF/Lane))...(VFStride/Lane)%(VF/Lane),
				428	// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
				429	// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
				430	// Where Lane is the # of lanes in a register:
				431	// VectorSize = 128 => Lane = 1
				432	// VectorSize = 256 => Lane = 2
				433	// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
				434	// {<[0\|3\|6\|1\|4\|7\|2\|5]-[8\|11\|14\|9\|12\|15\|10\|13]>}
				435	static void createShuffleStride(MVT VT, int Stride,
				436	SmallVectorImpl<uint32_t> &Mask) {
				437	int VectorSize = VT.getSizeInBits();
				438	int VF = VT.getVectorNumElements();
				439	int LaneCount = std::max(VectorSize / 128, 1);
				440	for (int Lane = 0; Lane < LaneCount; Lane++)
				441	for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
				442	Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
				443	}
				444
				445	// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
				446	// inside mask a shuffleMask. A mask contains exactly 3 groups, where
				447	// each group is a monotonically increasing sequence with stride 3.
				448	// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
				449	static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
				450	int VectorSize = VT.getSizeInBits();
				451	int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
				452	for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
				453	int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
				454	SizeInfo.push_back(GroupSize);
				455	FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
				456	}
				457	}
				458
				459	// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
				460	// vpalign works according to lanes
				461	// Where Lane is the # of lanes in a register:
				462	// VectorWide = 128 => Lane = 1
				463	// VectorWide = 256 => Lane = 2
				464	// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
				465	// For Lane = 2 shuffle pattern is:
				466	// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
				467	// Imm variable sets the offset amount. The result of the
				468	// function is stored inside ShuffleMask vector and it built as described in
Craig Topper	eaa1cf5	2018-10-25 05:00:20 +0000	[diff] [blame]	469	// the begin of the description. AlignDirection is a boolean that indicates the
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	470	// direction of the alignment. (false - align to the "right" side while true -
				471	// align to the "left" side)
				472	static void DecodePALIGNRMask(MVT VT, unsigned Imm,
				473	SmallVectorImpl<uint32_t> &ShuffleMask,
				474	bool AlignDirection = true, bool Unary = false) {
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	475	unsigned NumElts = VT.getVectorNumElements();
				476	unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
				477	unsigned NumLaneElts = NumElts / NumLanes;
				478
				479	Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
				480	unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
				481
				482	for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
				483	for (unsigned i = 0; i != NumLaneElts; ++i) {
				484	unsigned Base = i + Offset;
				485	// if i+offset is out of this lane then we actually need the other source
				486	// If Unary the other source is the first source.
				487	if (Base >= NumLaneElts)
				488	Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
				489	ShuffleMask.push_back(Base + l);
				490	}
				491	}
				492	}
				493
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	494	// concatSubVector - The function rebuilds the data to a correct expected
				495	// order. An assumption(The shape of the matrix) was taken for the
				496	// deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
				497	// This function ensures that the data is built in correct way for the lane
				498	// instructions. Each lane inside the vector is a 128-bit length.
				499	//
				500	// The 'InVec' argument contains the data in increasing order. In InVec[0] You
				501	// can find the first 128 bit data. The number of different lanes inside a
				502	// vector depends on the 'VecElems'.In general, the formula is
				503	// VecElems * type / 128. The size of the array 'InVec' depends and equal to
				504	// 'VecElems'.
				505
				506	// For VecElems = 16
				507	// Invec[0] - \|0\| Vec[0] - \|0\|
				508	// Invec[1] - \|1\| => Vec[1] - \|1\|
				509	// Invec[2] - \|2\| Vec[2] - \|2\|
				510
				511	// For VecElems = 32
				512	// Invec[0] - \|0\|1\| Vec[0] - \|0\|3\|
				513	// Invec[1] - \|2\|3\| => Vec[1] - \|1\|4\|
				514	// Invec[2] - \|4\|5\| Vec[2] - \|2\|5\|
				515
				516	// For VecElems = 64
				517	// Invec[0] - \|0\|1\|2 \|3 \| Vec[0] - \|0\|3\|6\|9 \|
				518	// Invec[1] - \|4\|5\|6 \|7 \| => Vec[1] - \|1\|4\|7\|10\|
				519	// Invec[2] - \|8\|9\|10\|11\| Vec[2] - \|2\|5\|8\|11\|
				520
				521	static void concatSubVector(Value *Vec, ArrayRef<Instruction > InVec,
				522	unsigned VecElems, IRBuilder<> Builder) {
				523	if (VecElems == 16) {
				524	for (int i = 0; i < 3; i++)
				525	Vec[i] = InVec[i];
				526	return;
				527	}
				528
				529	for (unsigned j = 0; j < VecElems / 32; j++)
				530	for (int i = 0; i < 3; i++)
				531	Vec[i + j * 3] = Builder.CreateShuffleVector(
				532	InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));
				533
				534	if (VecElems == 32)
				535	return;
				536
				537	for (int i = 0; i < 3; i++)
				538	Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	539	}
				540
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	541	void X86InterleavedAccessGroup::deinterleave8bitStride3(
				542	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				543	unsigned VecElems) {
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	544	// Example: Assuming we start from the following vectors:
				545	// Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
				546	// Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
				547	// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
				548
				549	TransposedMatrix.resize(3);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	550	SmallVector<uint32_t, 32> VPShuf;
				551	SmallVector<uint32_t, 32> VPAlign[2];
				552	SmallVector<uint32_t, 32> VPAlign2;
				553	SmallVector<uint32_t, 32> VPAlign3;
				554	SmallVector<uint32_t, 3> GroupSize;
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	555	Value Vec[6], TempVector[3];
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	556
				557	MVT VT = MVT::getVT(Shuffles[0]->getType());
				558
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	559	createShuffleStride(VT, 3, VPShuf);
				560	setGroupSize(VT, GroupSize);
				561
				562	for (int i = 0; i < 2; i++)
				563	DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
				564
				565	DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
				566	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
				567
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	568	concatSubVector(Vec, InVec, VecElems, Builder);
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	569	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				570	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				571	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				572
				573	for (int i = 0; i < 3; i++)
				574	Vec[i] = Builder.CreateShuffleVector(
				575	Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
				576
				577	// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
				578	// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
				579	// TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
				580
				581	for (int i = 0; i < 3; i++)
				582	TempVector[i] =
				583	Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
				584
				585	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				586	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				587	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				588
				589	for (int i = 0; i < 3; i++)
				590	Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
				591	VPAlign[1]);
				592
				593	// TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				594	// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				595	// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
				596
				597	Value *TempVec = Builder.CreateShuffleVector(
				598	Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
				599	TransposedMatrix[0] = Builder.CreateShuffleVector(
				600	Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
				601	TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
				602	TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	603	}
				604
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	605	// group2Shuffle reorder the shuffle stride back into continuous order.
				606	// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
				607	// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
				608	static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
				609	SmallVectorImpl<uint32_t> &Output) {
				610	int IndexGroup[3] = {0, 0, 0};
				611	int Index = 0;
				612	int VectorWidth = VT.getSizeInBits();
				613	int VF = VT.getVectorNumElements();
				614	// Find the index of the different groups.
				615	int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
				616	for (int i = 0; i < 3; i++) {
				617	IndexGroup[(Index * 3) % (VF / Lane)] = Index;
				618	Index += Mask[i];
				619	}
				620	// According to the index compute the convert mask.
				621	for (int i = 0; i < VF / Lane; i++) {
				622	Output.push_back(IndexGroup[i % 3]);
				623	IndexGroup[i % 3]++;
				624	}
				625	}
				626
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	627	void X86InterleavedAccessGroup::interleave8bitStride3(
				628	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
				629	unsigned VecElems) {
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	630	// Example: Assuming we start from the following vectors:
				631	// Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
				632	// Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
				633	// Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
				634
				635	TransposedMatrix.resize(3);
				636	SmallVector<uint32_t, 3> GroupSize;
				637	SmallVector<uint32_t, 32> VPShuf;
				638	SmallVector<uint32_t, 32> VPAlign[3];
				639	SmallVector<uint32_t, 32> VPAlign2;
				640	SmallVector<uint32_t, 32> VPAlign3;
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	641
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	642	Value Vec[3], TempVector[3];
				643	MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
				644
				645	setGroupSize(VT, GroupSize);
				646
				647	for (int i = 0; i < 3; i++)
				648	DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
				649
				650	DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
				651	DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
				652
				653	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
				654	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
				655	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
				656
				657	Vec[0] = Builder.CreateShuffleVector(
				658	InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
				659	Vec[1] = Builder.CreateShuffleVector(
				660	InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
				661	Vec[2] = InVec[2];
				662
				663	// Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
				664	// Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
				665	// Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
				666
				667	for (int i = 0; i < 3; i++)
				668	TempVector[i] =
				669	Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
				670
				671	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
				672	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
				673	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
				674
				675	for (int i = 0; i < 3; i++)
				676	Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
				677	VPAlign[2]);
				678
				679	// TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
				680	// TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
				681	// TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
				682
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	683	unsigned NumOfElm = VT.getVectorNumElements();
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	684	group2Shuffle(VT, GroupSize, VPShuf);
				685	reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	686	}
				687
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	688	void X86InterleavedAccessGroup::transpose_4x4(
				689	ArrayRef<Instruction *> Matrix,
				690	SmallVectorImpl<Value *> &TransposedMatrix) {
				691	assert(Matrix.size() == 4 && "Invalid matrix size");
				692	TransposedMatrix.resize(4);
				693
				694	// dst = src1[0,1],src2[0,1]
				695	uint32_t IntMask1[] = {0, 1, 4, 5};
				696	ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
				697	Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				698	Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				699
				700	// dst = src1[2,3],src2[2,3]
				701	uint32_t IntMask2[] = {2, 3, 6, 7};
				702	Mask = makeArrayRef(IntMask2, 4);
				703	Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
				704	Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
				705
				706	// dst = src1[0],src2[0],src1[2],src2[2]
				707	uint32_t IntMask3[] = {0, 4, 2, 6};
				708	Mask = makeArrayRef(IntMask3, 4);
				709	TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				710	TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				711
				712	// dst = src1[1],src2[1],src1[3],src2[3]
				713	uint32_t IntMask4[] = {1, 5, 3, 7};
				714	Mask = makeArrayRef(IntMask4, 4);
				715	TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
				716	TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
				717	}
				718
				719	// Lowers this interleaved access group into X86-specific
				720	// instructions/intrinsics.
				721	bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
				722	SmallVector<Instruction *, 4> DecomposedVectors;
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	723	SmallVector<Value *, 4> TransposedVectors;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	724	VectorType *ShuffleTy = Shuffles[0]->getType();
				725
				726	if (isa<LoadInst>(Inst)) {
				727	// Try to generate target-sized register(/instruction).
				728	decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
				729
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	730	Type *ShuffleEltTy = Inst->getType();
				731	unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	732	// Perform matrix-transposition in order to compute interleaved
				733	// results by generating some sort of (optimized) target-specific
				734	// instructions.
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	735
				736	switch (NumSubVecElems) {
				737	default:
				738	return false;
				739	case 4:
				740	transpose_4x4(DecomposedVectors, TransposedVectors);
				741	break;
				742	case 8:
				743	case 16:
				744	case 32:
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	745	case 64:
Michael Zuckerman	5a38594	2017-09-07 14:02:13 +0000	[diff] [blame]	746	deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
				747	NumSubVecElems);
				748	break;
				749	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	750
				751	// Now replace the unoptimized-interleaved-vectors with the
				752	// transposed-interleaved vectors.
				753	for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
				754	Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
				755
				756	return true;
				757	}
				758
				759	Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
				760	unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
				761
				762	// Lower the interleaved stores:
				763	// 1. Decompose the interleaved wide shuffle into individual shuffle
				764	// vectors.
Farhana Aleen	e4a89a6	2017-07-21 21:35:00 +0000	[diff] [blame]	765	decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
				766	DecomposedVectors);
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	767
				768	// 2. Transpose the interleaved-vectors into vectors of contiguous
				769	// elements.
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	770	switch (NumSubVecElems) {
				771	case 4:
				772	transpose_4x4(DecomposedVectors, TransposedVectors);
				773	break;
Michael Zuckerman	4a97df0	2017-09-25 14:50:38 +0000	[diff] [blame]	774	case 8:
				775	interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
				776	break;
Michael Zuckerman	680ac10	2017-08-07 13:22:39 +0000	[diff] [blame]	777	case 16:
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	778	case 32:
Michael Zuckerman	e4084f6b	2017-10-02 07:35:25 +0000	[diff] [blame]	779	case 64:
Michael Zuckerman	645f777	2017-09-26 18:49:11 +0000	[diff] [blame]	780	if (Factor == 4)
				781	interleave8bitStride4(DecomposedVectors, TransposedVectors,
				782	NumSubVecElems);
				783	if (Factor == 3)
				784	interleave8bitStride3(DecomposedVectors, TransposedVectors,
				785	NumSubVecElems);
Michael Zuckerman	c1918ad	2017-07-26 08:10:14 +0000	[diff] [blame]	786	break;
				787	default:
				788	return false;
				789	}
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	790
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	791	// 3. Concatenate the contiguous-vectors back into a wide vector.
				792	Value *WideVec = concatenateVectors(Builder, TransposedVectors);
				793
				794	// 4. Generate a store instruction for wide-vec.
				795	StoreInst *SI = cast<StoreInst>(Inst);
				796	Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
				797	SI->getAlignment());
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	798
				799	return true;
				800	}
				801
				802	// Lower interleaved load(s) into target specific instructions/
				803	// intrinsics. Lowering sequence varies depending on the vector-types, factor,
				804	// number of shuffles and ISA.
				805	// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	806	bool X86TargetLowering::lowerInterleavedLoad(
				807	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
				808	ArrayRef<unsigned> Indices, unsigned Factor) const {
				809	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				810	"Invalid interleave factor");
				811	assert(!Shuffles.empty() && "Empty shufflevector input");
				812	assert(Shuffles.size() == Indices.size() &&
				813	"Unmatched number of shufflevectors and indices");
				814
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	815	// Create an interleaved access group.
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	816	IRBuilder<> Builder(LI);
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	817	X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
				818	Builder);
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	819
David L Kreitzer	0e3ae30	2016-12-01 19:56:39 +0000	[diff] [blame]	820	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
David L Kreitzer	01a057a	2016-10-14 18:20:41 +0000	[diff] [blame]	821	}
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	822
				823	bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
				824	ShuffleVectorInst *SVI,
				825	unsigned Factor) const {
				826	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
				827	"Invalid interleave factor");
				828
Farhana Aleen	9bd593e	2017-06-22 23:56:31 +0000	[diff] [blame]	829	assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
Farhana Aleen	4b652a5	2017-06-22 22:59:04 +0000	[diff] [blame]	830	"Invalid interleaved store");
				831
				832	// Holds the indices of SVI that correspond to the starting index of each
				833	// interleaved shuffle.
				834	SmallVector<unsigned, 4> Indices;
				835	auto Mask = SVI->getShuffleMask();
				836	for (unsigned i = 0; i < Factor; i++)
				837	Indices.push_back(Mask[i]);
				838
				839	ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
				840
				841	// Create an interleaved access group.
				842	IRBuilder<> Builder(SI);
				843	X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
				844	Builder);
				845
				846	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
				847	}