Blame - llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp - toolchain/llvm-project

blob: 1b23c9ef1a6ac9137db52db59407c2b7f36e93e2 [file] [log] [blame]

Martin Elshuber	fef3036	2018-11-19 14:26:10 +0000	[diff] [blame]	1	//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---- C++ --===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// \file
				11	//
// This file defines the interleaved-load-combine pass. The pass searches for
// ShuffleVectorInst instructions that execute interleaving loads. If a matching
// pattern is found, it adds a combined load and further instructions in a
// pattern that is detectable by InterleavedAccessPass. The old instructions are
// left dead to be removed later. The pass is specifically designed to be
// executed just before InterleavedAccessPass to find any left-over instances
// that are not detected within former passes.
				19	//
				20	//===----------------------------------------------------------------------===//
				21
				22	#include "llvm/ADT/Statistic.h"
				23	#include "llvm/Analysis/MemoryLocation.h"
				24	#include "llvm/Analysis/MemorySSA.h"
				25	#include "llvm/Analysis/MemorySSAUpdater.h"
				26	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
				27	#include "llvm/Analysis/TargetTransformInfo.h"
				28	#include "llvm/CodeGen/Passes.h"
				29	#include "llvm/CodeGen/TargetLowering.h"
				30	#include "llvm/CodeGen/TargetPassConfig.h"
				31	#include "llvm/CodeGen/TargetSubtargetInfo.h"
				32	#include "llvm/IR/DataLayout.h"
				33	#include "llvm/IR/Dominators.h"
				34	#include "llvm/IR/Function.h"
				35	#include "llvm/IR/Instructions.h"
				36	#include "llvm/IR/LegacyPassManager.h"
				37	#include "llvm/IR/Module.h"
				38	#include "llvm/Pass.h"
				39	#include "llvm/Support/Debug.h"
				40	#include "llvm/Support/ErrorHandling.h"
				41	#include "llvm/Support/raw_ostream.h"
				42	#include "llvm/Target/TargetMachine.h"
Martin Elshuber	fef3036	2018-11-19 14:26:10 +0000	[diff] [blame]	43
				44	#include <algorithm>
				45	#include <cassert>
				46	#include <list>
				47
				48	using namespace llvm;
				49
				50	#define DEBUG_TYPE "interleaved-load-combine"
				51
				52	namespace {
				53
				54	/// Statistic counter
				55	STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
				56
				57	/// Option to disable the pass
				58	static cl::opt<bool> DisableInterleavedLoadCombine(
				59	"disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
				60	cl::desc("Disable combining of interleaved loads"));
				61
				62	struct VectorInfo;
				63
				64	struct InterleavedLoadCombineImpl {
				65	public:
				66	InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
				67	TargetMachine &TM)
				68	: F(F), DT(DT), MSSA(MSSA),
				69	TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
				70	TTI(TM.getTargetTransformInfo(F)) {}
				71
				72	/// Scan the function for interleaved load candidates and execute the
				73	/// replacement if applicable.
				74	bool run();
				75
				76	private:
				77	/// Function this pass is working on
				78	Function &F;
				79
				80	/// Dominator Tree Analysis
				81	DominatorTree &DT;
				82
				83	/// Memory Alias Analyses
				84	MemorySSA &MSSA;
				85
				86	/// Target Lowering Information
				87	const TargetLowering &TLI;
				88
				89	/// Target Transform Information
				90	const TargetTransformInfo TTI;
				91
				92	/// Find the instruction in sets LIs that dominates all others, return nullptr
				93	/// if there is none.
				94	LoadInst findFirstLoad(const std::set<LoadInst > &LIs);
				95
				96	/// Replace interleaved load candidates. It does additional
				97	/// analyses if this makes sense. Returns true on success and false
				98	/// of nothing has been changed.
				99	bool combine(std::list<VectorInfo> &InterleavedLoad,
				100	OptimizationRemarkEmitter &ORE);
				101
				102	/// Given a set of VectorInfo containing candidates for a given interleave
				103	/// factor, find a set that represents a 'factor' interleaved load.
				104	bool findPattern(std::list<VectorInfo> &Candidates,
				105	std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
				106	const DataLayout &DL);
				107	}; // InterleavedLoadCombine
				108
				109	/// First Order Polynomial on an n-Bit Integer Value
				110	///
				111	/// Polynomial(Value) = Value * B + A + E*2^(n-e)
				112	///
				113	/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
				114	/// significant bits. It is introduced if an exact computation cannot be proven
				115	/// (e.q. division by 2).
				116	///
				117	/// As part of this optimization multiple loads will be combined. It necessary
				118	/// to prove that loads are within some relative offset to each other. This
				119	/// class is used to prove relative offsets of values loaded from memory.
				120	///
				121	/// Representing an integer in this form is sound since addition in two's
				122	/// complement is associative (trivial) and multiplication distributes over the
				123	/// addition (see Proof(1) in Polynomial::mul). Further, both operations
				124	/// commute.
				125	//
				126	// Example:
				127	// declare @fn(i64 %IDX, <4 x float>* %PTR) {
				128	// %Pa1 = add i64 %IDX, 2
				129	// %Pa2 = lshr i64 %Pa1, 1
				130	// %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
				131	// %Va = load <4 x float>, <4 x float>* %Pa3
				132	//
				133	// %Pb1 = add i64 %IDX, 4
				134	// %Pb2 = lshr i64 %Pb1, 1
				135	// %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
				136	// %Vb = load <4 x float>, <4 x float>* %Pb3
				137	// ... }
				138	//
				139	// The goal is to prove that two loads load consecutive addresses.
				140	//
				141	// In this case the polynomials are constructed by the following
				142	// steps.
				143	//
				144	// The number tag #e specifies the error bits.
				145	//
				146	// Pa_0 = %IDX #0
				147	// Pa_1 = %IDX + 2 #0 \| add 2
				148	// Pa_2 = %IDX/2 + 1 #1 \| lshr 1
				149	// Pa_3 = %IDX/2 + 1 #1 \| GEP, step signext to i64
				150	// Pa_4 = (%IDX/2)*16 + 16 #0 \| GEP, multiply index by sizeof(4) for floats
				151	// Pa_5 = (%IDX/2)*16 + 16 #0 \| GEP, add offset of leading components
				152	//
				153	// Pb_0 = %IDX #0
				154	// Pb_1 = %IDX + 4 #0 \| add 2
				155	// Pb_2 = %IDX/2 + 2 #1 \| lshr 1
				156	// Pb_3 = %IDX/2 + 2 #1 \| GEP, step signext to i64
				157	// Pb_4 = (%IDX/2)*16 + 32 #0 \| GEP, multiply index by sizeof(4) for floats
				158	// Pb_5 = (%IDX/2)*16 + 16 #0 \| GEP, add offset of leading components
				159	//
				160	// Pb_5 - Pa_5 = 16 #0 \| subtract to get the offset
				161	//
				162	// Remark: %PTR is not maintained within this class. So in this instance the
				163	// offset of 16 can only be assumed if the pointers are equal.
				164	//
				165	class Polynomial {
				166	/// Operations on B
				167	enum BOps {
				168	LShr,
				169	Mul,
				170	SExt,
				171	Trunc,
				172	};
				173
				174	/// Number of Error Bits e
				175	unsigned ErrorMSBs;
				176
				177	/// Value
				178	Value *V;
				179
				180	/// Coefficient B
				181	SmallVector<std::pair<BOps, APInt>, 4> B;
				182
				183	/// Coefficient A
				184	APInt A;
				185
				186	public:
				187	Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() {
				188	IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
				189	if (Ty) {
				190	ErrorMSBs = 0;
				191	this->V = V;
				192	A = APInt(Ty->getBitWidth(), 0);
				193	}
				194	}
				195
				196	Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
				197	: ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {}
				198
				199	Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
				200	: ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {}
				201
				202	Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {}
				203
				204	/// Increment and clamp the number of undefined bits.
				205	void incErrorMSBs(unsigned amt) {
				206	if (ErrorMSBs == (unsigned)-1)
				207	return;
				208
				209	ErrorMSBs += amt;
				210	if (ErrorMSBs > A.getBitWidth())
				211	ErrorMSBs = A.getBitWidth();
				212	}
				213
				214	/// Decrement and clamp the number of undefined bits.
				215	void decErrorMSBs(unsigned amt) {
				216	if (ErrorMSBs == (unsigned)-1)
				217	return;
				218
				219	if (ErrorMSBs > amt)
				220	ErrorMSBs -= amt;
				221	else
				222	ErrorMSBs = 0;
				223	}
				224
				225	/// Apply an add on the polynomial
				226	Polynomial &add(const APInt &C) {
				227	// Note: Addition is associative in two's complement even when in case of
				228	// signed overflow.
				229	//
				230	// Error bits can only propagate into higher significant bits. As these are
				231	// already regarded as undefined, there is no change.
				232	//
				233	// Theorem: Adding a constant to a polynomial does not change the error
				234	// term.
				235	//
				236	// Proof:
				237	//
				238	// Since the addition is associative and commutes:
				239	//
				240	// (B + A + E2^(n-e)) + C = B + (A + C) + E2^(n-e)
				241	// [qed]
				242
				243	if (C.getBitWidth() != A.getBitWidth()) {
				244	ErrorMSBs = (unsigned)-1;
				245	return *this;
				246	}
				247
				248	A += C;
				249	return *this;
				250	}
				251
				252	/// Apply a multiplication onto the polynomial.
				253	Polynomial &mul(const APInt &C) {
				254	// Note: Multiplication distributes over the addition
				255	//
				256	// Theorem: Multiplication distributes over the addition
				257	//
				258	// Proof(1):
				259	//
				260	// (B+A)*C =-
				261	// = (B + A) + (B + A) + .. {C Times}
				262	// addition is associative and commutes, hence
				263	// = B + B + .. {C Times} .. + A + A + .. {C times}
				264	// = BC + AC
				265	// (see (function add) for signed values and overflows)
				266	// [qed]
				267	//
				268	// Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
				269	// to the left.
				270	//
				271	// Proof(2):
				272	//
				273	// Let B' and A' be the n-Bit inputs with some unknown errors EA,
				274	// EB at e leading bits. B' and A' can be written down as:
				275	//
				276	// B' = B + 2^(n-e)*EB
				277	// A' = A + 2^(n-e)*EA
				278	//
				279	// Let C' be an input with c trailing zero bits. C' can be written as
				280	//
				281	// C' = C*2^c
				282	//
				283	// Therefore we can compute the result by using distributivity and
				284	// commutativity.
				285	//
				286	// (B'C' + A'C') = [B + 2^(n-e)EB] C' + [A + 2^(n-e)EA] C' =
				287	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] * C' =
				288	// = (B'+A') * C' =
				289	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] * C' =
				290	// = [B + A + 2^(n-e)EB + 2^(n-e)EA] * C' =
				291	// = (B + A) * C' + [2^(n-e)EB + 2^(n-e)EA)] * C' =
				292	// = (B + A) * C' + [2^(n-e)EB + 2^(n-e)EA)] * C*2^c =
				293	// = (B + A) * C' + C(EB + EA)2^(n-e)*2^c =
				294	//
				295	// Let EC be the final error with EC = C*(EB + EA)
				296	//
				297	// = (B + A)C' + EC2^(n-e)*2^c =
				298	// = (B + A)C' + EC2^(n-(e-c))
				299	//
				300	// Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
				301	// less error bits than the input. c bits are shifted out to the left.
				302	// [qed]
				303
				304	if (C.getBitWidth() != A.getBitWidth()) {
				305	ErrorMSBs = (unsigned)-1;
				306	return *this;
				307	}
				308
				309	// Multiplying by one is a no-op.
				310	if (C.isOneValue()) {
				311	return *this;
				312	}
				313
				314	// Multiplying by zero removes the coefficient B and defines all bits.
				315	if (C.isNullValue()) {
				316	ErrorMSBs = 0;
				317	deleteB();
				318	}
				319
				320	// See Proof(2): Trailing zero bits indicate a left shift. This removes
				321	// leading bits from the result even if they are undefined.
				322	decErrorMSBs(C.countTrailingZeros());
				323
				324	A *= C;
				325	pushBOperation(Mul, C);
				326	return *this;
				327	}
				328
				329	/// Apply a logical shift right on the polynomial
				330	Polynomial &lshr(const APInt &C) {
				331	// Theorem(1): (B + A + E2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'2^(n-e')
				332	// where
				333	// e' = e + 1,
				334	// E is a e-bit number,
				335	// E' is a e'-bit number,
				336	// holds under the following precondition:
				337	// pre(1): A % 2 = 0
				338	// pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
				339	// where >> expresses a logical shift to the right, with adding zeros.
				340	//
				341	// We need to show that for every, E there is a E'
				342	//
				343	// B = b_h * 2^(n-1) + b_m * 2 + b_l
				344	// A = a_h * 2^(n-1) + a_m * 2 (pre(1))
				345	//
				346	// where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
				347	//
				348	// Let X = (B + A + E*2^(n-e)) >> 1
				349	// Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
				350	//
				351	// X = [B + A + E*2^(n-e)] >> 1 =
				352	// = [ b_h * 2^(n-1) + b_m * 2 + b_l +
				353	// + a_h * 2^(n-1) + a_m * 2 +
				354	// + E * 2^(n-e) ] >> 1 =
				355	//
				356	// The sum is built by putting the overflow of [a_m + b+n] into the term
				357	// 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
				358	// this bit is discarded. This is expressed by % 2.
				359	//
				360	// The bit in position 0 cannot overflow into the term (b_m + a_m).
				361	//
				362	// = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
				363	// + ((b_m + a_m) % 2^(n-2)) * 2 +
				364	// + b_l + E * 2^(n-e) ] >> 1 =
				365	//
				366	// The shift is computed by dividing the terms by 2 and by cutting off
				367	// b_l.
				368	//
				369	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				370	// + ((b_m + a_m) % 2^(n-2)) +
				371	// + E * 2^(n-(e+1)) =
				372	//
				373	// by the definition in the Theorem e+1 = e'
				374	//
				375	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				376	// + ((b_m + a_m) % 2^(n-2)) +
				377	// + E * 2^(n-e') =
				378	//
				379	// Compute Y by applying distributivity first
				380	//
				381	// Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
				382	// = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
				383	// + (a_h * 2^(n-1) + a_m * 2) >> 1 +
				384	// + E * 2^(n-e) >> 1 =
				385	//
				386	// Again, the shift is computed by dividing the terms by 2 and by cutting
				387	// off b_l.
				388	//
				389	// = b_h * 2^(n-2) + b_m +
				390	// + a_h * 2^(n-2) + a_m +
				391	// + E * 2^(n-(e+1)) =
				392	//
				393	// Again, the sum is built by putting the overflow of [a_m + b+n] into
				394	// the term 2^(n-1). But this time there is room for a second bit in the
				395	// term 2^(n-2) we add this bit to a new term and denote it o_h in a
				396	// second step.
				397	//
				398	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
				399	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				400	// + ((b_m + a_m) % 2^(n-2)) +
				401	// + E * 2^(n-(e+1)) =
				402	//
				403	// Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
				404	// Further replace e+1 by e'.
				405	//
				406	// = o_h * 2^(n-1) +
				407	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				408	// + ((b_m + a_m) % 2^(n-2)) +
				409	// + E * 2^(n-e') =
				410	//
				411	// Move o_h into the error term and construct E'. To ensure that there is
				412	// no 2^x with negative x, this step requires pre(2) (e < n).
				413	//
				414	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				415	// + ((b_m + a_m) % 2^(n-2)) +
				416	// + o_h * 2^(e'-1) * 2^(n-e') + \| pre(2), move 2^(e'-1)
				417	// \| out of the old exponent
				418	// + E * 2^(n-e') =
				419	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				420	// + ((b_m + a_m) % 2^(n-2)) +
				421	// + [o_h * 2^(e'-1) + E] * 2^(n-e') + \| move 2^(e'-1) out of
				422	// \| the old exponent
				423	//
				424	// Let E' = o_h * 2^(e'-1) + E
				425	//
				426	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
				427	// + ((b_m + a_m) % 2^(n-2)) +
				428	// + E' * 2^(n-e')
				429	//
				430	// Because X and Y are distinct only in there error terms and E' can be
				431	// constructed as shown the theorem holds.
				432	// [qed]
				433	//
				434	// For completeness in case of the case e=n it is also required to show that
				435	// distributivity can be applied.
				436	//
				437	// In this case Theorem(1) transforms to (the pre-condition on A can also be
				438	// dropped)
				439	//
				440	// Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
				441	// where
				442	// A, B, E, E' are two's complement numbers with the same bit
				443	// width
				444	//
				445	// Let A + B + E = X
				446	// Let (B >> 1) + (A >> 1) = Y
				447	//
				448	// Therefore we need to show that for every X and Y there is an E' which
				449	// makes the equation
				450	//
				451	// X = Y + E'
				452	//
				453	// hold. This is trivially the case for E' = X - Y.
				454	//
				455	// [qed]
				456	//
				457	// Remark: Distributing lshr with and arbitrary number n can be expressed as
				458	// ((((B + A) lshr 1) lshr 1) ... ) {n times}.
				459	// This construction induces n additional error bits at the left.
				460
				461	if (C.getBitWidth() != A.getBitWidth()) {
				462	ErrorMSBs = (unsigned)-1;
				463	return *this;
				464	}
				465
				466	if (C.isNullValue())
				467	return *this;
				468
				469	// Test if the result will be zero
				470	unsigned shiftAmt = C.getZExtValue();
				471	if (shiftAmt >= C.getBitWidth())
				472	return mul(APInt(C.getBitWidth(), 0));
				473
				474	// The proof that shiftAmt LSBs are zero for at least one summand is only
				475	// possible for the constant number.
				476	//
				477	// If this can be proven add shiftAmt to the error counter
				478	// `ErrorMSBs`. Otherwise set all bits as undefined.
				479	if (A.countTrailingZeros() < shiftAmt)
				480	ErrorMSBs = A.getBitWidth();
				481	else
				482	incErrorMSBs(shiftAmt);
				483
				484	// Apply the operation.
				485	pushBOperation(LShr, C);
				486	A = A.lshr(shiftAmt);
				487
				488	return *this;
				489	}
				490
				491	/// Apply a sign-extend or truncate operation on the polynomial.
				492	Polynomial &sextOrTrunc(unsigned n) {
				493	if (n < A.getBitWidth()) {
				494	// Truncate: Clearly undefined Bits on the MSB side are removed
				495	// if there are any.
				496	decErrorMSBs(A.getBitWidth() - n);
				497	A = A.trunc(n);
				498	pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
				499	}
				500	if (n > A.getBitWidth()) {
				501	// Extend: Clearly extending first and adding later is different
				502	// to adding first and extending later in all extended bits.
				503	incErrorMSBs(n - A.getBitWidth());
				504	A = A.sext(n);
				505	pushBOperation(SExt, APInt(sizeof(n) * 8, n));
				506	}
				507
				508	return *this;
				509	}
				510
				511	/// Test if there is a coefficient B.
				512	bool isFirstOrder() const { return V != nullptr; }
				513
				514	/// Test coefficient B of two Polynomials are equal.
				515	bool isCompatibleTo(const Polynomial &o) const {
				516	// The polynomial use different bit width.
				517	if (A.getBitWidth() != o.A.getBitWidth())
				518	return false;
				519
				520	// If neither Polynomial has the Coefficient B.
				521	if (!isFirstOrder() && !o.isFirstOrder())
				522	return true;
				523
				524	// The index variable is different.
				525	if (V != o.V)
				526	return false;
				527
				528	// Check the operations.
				529	if (B.size() != o.B.size())
				530	return false;
				531
				532	auto ob = o.B.begin();
				533	for (auto &b : B) {
				534	if (b != *ob)
				535	return false;
				536	ob++;
				537	}
				538
				539	return true;
				540	}
				541
				542	/// Subtract two polynomials, return an undefined polynomial if
				543	/// subtraction is not possible.
				544	Polynomial operator-(const Polynomial &o) const {
				545	// Return an undefined polynomial if incompatible.
				546	if (!isCompatibleTo(o))
				547	return Polynomial();
				548
				549	// If the polynomials are compatible (meaning they have the same
				550	// coefficient on B), B is eliminated. Thus a polynomial solely
				551	// containing A is returned
				552	return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
				553	}
				554
				555	/// Subtract a constant from a polynomial,
				556	Polynomial operator-(uint64_t C) const {
				557	Polynomial Result(*this);
				558	Result.A -= C;
				559	return Result;
				560	}
				561
				562	/// Add a constant to a polynomial,
				563	Polynomial operator+(uint64_t C) const {
				564	Polynomial Result(*this);
				565	Result.A += C;
				566	return Result;
				567	}
				568
				569	/// Returns true if it can be proven that two Polynomials are equal.
				570	bool isProvenEqualTo(const Polynomial &o) {
				571	// Subtract both polynomials and test if it is fully defined and zero.
				572	Polynomial r = *this - o;
				573	return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue());
				574	}
				575
				576	/// Print the polynomial into a stream.
				577	void print(raw_ostream &OS) const {
				578	OS << "[{#ErrBits:" << ErrorMSBs << "} ";
				579
				580	if (V) {
				581	for (auto b : B)
				582	OS << "(";
				583	OS << "(" << *V << ") ";
				584
				585	for (auto b : B) {
				586	switch (b.first) {
				587	case LShr:
				588	OS << "LShr ";
				589	break;
				590	case Mul:
				591	OS << "Mul ";
				592	break;
				593	case SExt:
				594	OS << "SExt ";
				595	break;
				596	case Trunc:
				597	OS << "Trunc ";
				598	break;
				599	}
				600
				601	OS << b.second << ") ";
				602	}
				603	}
				604
				605	OS << "+ " << A << "]";
				606	}
				607
				608	private:
				609	void deleteB() {
				610	V = nullptr;
				611	B.clear();
				612	}
				613
				614	void pushBOperation(const BOps Op, const APInt &C) {
				615	if (isFirstOrder()) {
				616	B.push_back(std::make_pair(Op, C));
				617	return;
				618	}
				619	}
				620	};
				621
/// VectorInfo stores the following abstract information for each vector
/// element:
///
/// 1) the memory address loaded into the element as Polynomial,
/// 2) a set of load instructions necessary to construct the vector,
/// 3) a set of all other instructions that are necessary to create the vector and
/// 4) a pointer value that can be used as relative base for all elements.
				629	struct VectorInfo {
				630	private:
				631	VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
				632	llvm_unreachable(
				633	"Copying VectorInfo is neither implemented nor necessary,");
				634	}
				635
				636	public:
				637	/// Information of a Vector Element
				638	struct ElementInfo {
				639	/// Offset Polynomial.
				640	Polynomial Ofs;
				641
				642	/// The Load Instruction used to Load the entry. LI is null if the pointer
				643	/// of the load instruction does not point on to the entry
				644	LoadInst *LI;
				645
				646	ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
				647	: Ofs(Offset), LI(LI) {}
				648	};
				649
				650	/// Basic-block the load instructions are within
				651	BasicBlock *BB;
				652
				653	/// Pointer value of all participation load instructions
				654	Value *PV;
				655
				656	/// Participating load instructions
				657	std::set<LoadInst *> LIs;
				658
				659	/// Participating instructions
				660	std::set<Instruction *> Is;
				661
				662	/// Final shuffle-vector instruction
				663	ShuffleVectorInst *SVI;
				664
				665	/// Information of the offset for each vector element
				666	ElementInfo *EI;
				667
				668	/// Vector Type
				669	VectorType *const VTy;
				670
				671	VectorInfo(VectorType *VTy)
				672	: BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
				673	EI = new ElementInfo[VTy->getNumElements()];
				674	}
				675
				676	virtual ~VectorInfo() { delete[] EI; }
				677
				678	unsigned getDimension() const { return VTy->getNumElements(); }
				679
				680	/// Test if the VectorInfo can be part of an interleaved load with the
				681	/// specified factor.
				682	///
				683	/// \param Factor of the interleave
				684	/// \param DL Targets Datalayout
				685	///
				686	/// \returns true if this is possible and false if not
				687	bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
				688	unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
				689	for (unsigned i = 1; i < getDimension(); i++) {
				690	if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
				691	return false;
				692	}
				693	}
				694	return true;
				695	}
				696
				697	/// Recursively computes the vector information stored in V.
				698	///
				699	/// This function delegates the work to specialized implementations
				700	///
				701	/// \param V Value to operate on
				702	/// \param Result Result of the computation
				703	///
				704	/// \returns false if no sensible information can be gathered.
				705	static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
				706	ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
				707	if (SVI)
				708	return computeFromSVI(SVI, Result, DL);
				709	LoadInst *LI = dyn_cast<LoadInst>(V);
				710	if (LI)
				711	return computeFromLI(LI, Result, DL);
				712	BitCastInst *BCI = dyn_cast<BitCastInst>(V);
				713	if (BCI)
				714	return computeFromBCI(BCI, Result, DL);
				715	return false;
				716	}
				717
				718	/// BitCastInst specialization to compute the vector information.
				719	///
				720	/// \param BCI BitCastInst to operate on
				721	/// \param Result Result of the computation
				722	///
				723	/// \returns false if no sensible information can be gathered.
				724	static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
				725	const DataLayout &DL) {
				726	Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
				727
				728	if (!Op)
				729	return false;
				730
				731	VectorType *VTy = dyn_cast<VectorType>(Op->getType());
				732	if (!VTy)
				733	return false;
				734
				735	// We can only cast from large to smaller vectors
				736	if (Result.VTy->getNumElements() % VTy->getNumElements())
				737	return false;
				738
				739	unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
				740	unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
				741	unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
				742
				743	if (NewSize * Factor != OldSize)
				744	return false;
				745
				746	VectorInfo Old(VTy);
				747	if (!compute(Op, Old, DL))
				748	return false;
				749
				750	for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
				751	for (unsigned j = 0; j < Factor; j++) {
				752	Result.EI[i + j] =
				753	ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
				754	j == 0 ? Old.EI[i / Factor].LI : nullptr);
				755	}
				756	}
				757
				758	Result.BB = Old.BB;
				759	Result.PV = Old.PV;
				760	Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
				761	Result.Is.insert(Old.Is.begin(), Old.Is.end());
				762	Result.Is.insert(BCI);
				763	Result.SVI = nullptr;
				764
				765	return true;
				766	}
				767
				768	/// ShuffleVectorInst specialization to compute vector information.
				769	///
				770	/// \param SVI ShuffleVectorInst to operate on
				771	/// \param Result Result of the computation
				772	///
				773	/// Compute the left and the right side vector information and merge them by
				774	/// applying the shuffle operation. This function also ensures that the left
				775	/// and right side have compatible loads. This means that all loads are with
				776	/// in the same basic block and are based on the same pointer.
				777	///
				778	/// \returns false if no sensible information can be gathered.
				779	static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
				780	const DataLayout &DL) {
				781	VectorType *ArgTy = dyn_cast<VectorType>(SVI->getOperand(0)->getType());
				782	assert(ArgTy && "ShuffleVector Operand is not a VectorType");
				783
				784	// Compute the left hand vector information.
				785	VectorInfo LHS(ArgTy);
				786	if (!compute(SVI->getOperand(0), LHS, DL))
				787	LHS.BB = nullptr;
				788
				789	// Compute the right hand vector information.
				790	VectorInfo RHS(ArgTy);
				791	if (!compute(SVI->getOperand(1), RHS, DL))
				792	RHS.BB = nullptr;
				793
				794	// Neither operand produced sensible results?
				795	if (!LHS.BB && !RHS.BB)
				796	return false;
				797	// Only RHS produced sensible results?
				798	else if (!LHS.BB) {
				799	Result.BB = RHS.BB;
				800	Result.PV = RHS.PV;
				801	}
				802	// Only LHS produced sensible results?
				803	else if (!RHS.BB) {
				804	Result.BB = LHS.BB;
				805	Result.PV = LHS.PV;
				806	}
				807	// Both operands produced sensible results?
Martin Elshuber	5a47dc6	2018-11-19 18:35:31 +0000	[diff] [blame^]	808	else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
Martin Elshuber	fef3036	2018-11-19 14:26:10 +0000	[diff] [blame]	809	Result.BB = LHS.BB;
				810	Result.PV = LHS.PV;
				811	}
				812	// Both operands produced sensible results but they are incompatible.
				813	else {
				814	return false;
				815	}
				816
				817	// Merge and apply the operation on the offset information.
				818	if (LHS.BB) {
				819	Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
				820	Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
				821	}
				822	if (RHS.BB) {
				823	Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
				824	Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
				825	}
				826	Result.Is.insert(SVI);
				827	Result.SVI = SVI;
				828
				829	int j = 0;
				830	for (int i : SVI->getShuffleMask()) {
				831	assert((i < 2 * (signed)ArgTy->getNumElements()) &&
				832	"Invalid ShuffleVectorInst (index out of bounds)");
				833
				834	if (i < 0)
				835	Result.EI[j] = ElementInfo();
				836	else if (i < (signed)ArgTy->getNumElements()) {
				837	if (LHS.BB)
				838	Result.EI[j] = LHS.EI[i];
				839	else
				840	Result.EI[j] = ElementInfo();
				841	} else {
				842	if (RHS.BB)
				843	Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
				844	else
				845	Result.EI[j] = ElementInfo();
				846	}
				847	j++;
				848	}
				849
				850	return true;
				851	}
				852
				853	/// LoadInst specialization to compute vector information.
				854	///
				855	/// This function also acts as abort condition to the recursion.
				856	///
				857	/// \param LI LoadInst to operate on
				858	/// \param Result Result of the computation
				859	///
				860	/// \returns false if no sensible information can be gathered.
				861	static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
				862	const DataLayout &DL) {
				863	Value *BasePtr;
				864	Polynomial Offset;
				865
				866	if (LI->isVolatile())
				867	return false;
				868
				869	if (LI->isAtomic())
				870	return false;
				871
				872	// Get the base polynomial
				873	computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
				874
				875	Result.BB = LI->getParent();
				876	Result.PV = BasePtr;
				877	Result.LIs.insert(LI);
				878	Result.Is.insert(LI);
				879
				880	for (unsigned i = 0; i < Result.getDimension(); i++) {
				881	Value *Idx[2] = {
				882	ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
				883	ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
				884	};
				885	int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2));
				886	Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
				887	}
				888
				889	return true;
				890	}
				891
				892	/// Recursively compute polynomial of a value.
				893	///
				894	/// \param BO Input binary operation
				895	/// \param Result Result polynomial
				896	static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
				897	Value *LHS = BO.getOperand(0);
				898	Value *RHS = BO.getOperand(1);
				899
				900	// Find the RHS Constant if any
				901	ConstantInt *C = dyn_cast<ConstantInt>(RHS);
				902	if ((!C) && BO.isCommutative()) {
				903	C = dyn_cast<ConstantInt>(LHS);
				904	if (C)
				905	std::swap(LHS, RHS);
				906	}
				907
				908	switch (BO.getOpcode()) {
				909	case Instruction::Add:
				910	if (!C)
				911	break;
				912
				913	computePolynomial(*LHS, Result);
				914	Result.add(C->getValue());
				915	return;
				916
				917	case Instruction::LShr:
				918	if (!C)
				919	break;
				920
				921	computePolynomial(*LHS, Result);
				922	Result.lshr(C->getValue());
				923	return;
				924
				925	default:
				926	break;
				927	}
				928
				929	Result = Polynomial(&BO);
				930	}
				931
				932	/// Recursively compute polynomial of a value
				933	///
				934	/// \param V input value
				935	/// \param Result result polynomial
				936	static void computePolynomial(Value &V, Polynomial &Result) {
				937	if (isa<BinaryOperator>(&V))
				938	computePolynomialBinOp(*dyn_cast<BinaryOperator>(&V), Result);
				939	else
				940	Result = Polynomial(&V);
				941	}
				942
				943	/// Compute the Polynomial representation of a Pointer type.
				944	///
				945	/// \param Ptr input pointer value
				946	/// \param Result result polynomial
				947	/// \param BasePtr pointer the polynomial is based on
				948	/// \param DL Datalayout of the target machine
				949	static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
				950	Value *&BasePtr,
				951	const DataLayout &DL) {
				952	// Not a pointer type? Return an undefined polynomial
				953	PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
				954	if (!PtrTy) {
				955	Result = Polynomial();
				956	BasePtr = nullptr;
				957	}
				958	unsigned PointerBits =
				959	DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
				960
				961	/// Skip pointer casts. Return Zero polynomial otherwise
				962	if (isa<CastInst>(&Ptr)) {
				963	CastInst &CI = *cast<CastInst>(&Ptr);
				964	switch (CI.getOpcode()) {
				965	case Instruction::BitCast:
				966	computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
				967	break;
				968	default:
				969	BasePtr = &Ptr;
				970	Polynomial(PointerBits, 0);
				971	break;
				972	}
				973	}
				974	/// Resolve GetElementPtrInst.
				975	else if (isa<GetElementPtrInst>(&Ptr)) {
				976	GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
				977
				978	APInt BaseOffset(PointerBits, 0);
				979
				980	// Check if we can compute the Offset with accumulateConstantOffset
				981	if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
				982	Result = Polynomial(BaseOffset);
				983	BasePtr = GEP.getPointerOperand();
				984	return;
				985	} else {
				986	// Otherwise we allow that the last index operand of the GEP is
				987	// non-constant.
				988	unsigned idxOperand, e;
				989	SmallVector<Value *, 4> Indices;
				990	for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
				991	idxOperand++) {
				992	ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
				993	if (!IDX)
				994	break;
				995	Indices.push_back(IDX);
				996	}
				997
				998	// It must also be the last operand.
				999	if (idxOperand + 1 != e) {
				1000	Result = Polynomial();
				1001	BasePtr = nullptr;
				1002	return;
				1003	}
				1004
				1005	// Compute the polynomial of the index operand.
				1006	computePolynomial(*GEP.getOperand(idxOperand), Result);
				1007
				1008	// Compute base offset from zero based index, excluding the last
				1009	// variable operand.
				1010	BaseOffset =
				1011	DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
				1012
				1013	// Apply the operations of GEP to the polynomial.
				1014	unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
				1015	Result.sextOrTrunc(PointerBits);
				1016	Result.mul(APInt(PointerBits, ResultSize));
				1017	Result.add(BaseOffset);
				1018	BasePtr = GEP.getPointerOperand();
				1019	}
				1020	}
				1021	// All other instructions are handled by using the value as base pointer and
				1022	// a zero polynomial.
				1023	else {
				1024	BasePtr = &Ptr;
				1025	Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
				1026	}
				1027	}
				1028
				1029	#ifndef NDEBUG
				1030	void print(raw_ostream &OS) const {
				1031	if (PV)
				1032	OS << *PV;
				1033	else
				1034	OS << "(none)";
				1035	OS << " + ";
				1036	for (unsigned i = 0; i < getDimension(); i++)
				1037	OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
				1038	OS << "]";
				1039	}
				1040	#endif
				1041	};
				1042
				1043	#ifndef NDEBUG
				1044	static raw_ostream &operator<<(raw_ostream &OS, const VectorInfo &S) {
				1045	S.print(OS);
				1046	return OS;
				1047	}
				1048	#endif
				1049	} // anonymous namespace
				1050
				1051	bool InterleavedLoadCombineImpl::findPattern(
				1052	std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
				1053	unsigned Factor, const DataLayout &DL) {
				1054	for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
				1055	unsigned i;
				1056	// Try to find an interleaved load using the front of Worklist as first line
				1057	unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
				1058
				1059	// List containing iterators pointing to the VectorInfos of the candidates
				1060	std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
				1061
				1062	for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
				1063	if (C->VTy != C0->VTy)
				1064	continue;
				1065	if (C->BB != C0->BB)
				1066	continue;
				1067	if (C->PV != C0->PV)
				1068	continue;
				1069
				1070	// Check the current value matches any of factor - 1 remaining lines
				1071	for (i = 1; i < Factor; i++) {
				1072	if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
				1073	Res[i] = C;
				1074	}
				1075	}
				1076
				1077	for (i = 1; i < Factor; i++) {
				1078	if (Res[i] == Candidates.end())
				1079	break;
				1080	}
				1081	if (i == Factor) {
				1082	Res[0] = C0;
				1083	break;
				1084	}
				1085	}
				1086
				1087	if (Res[0] != Candidates.end()) {
				1088	// Move the result into the output
				1089	for (unsigned i = 0; i < Factor; i++) {
				1090	InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
				1091	}
				1092
				1093	return true;
				1094	}
				1095	}
				1096	return false;
				1097	}
				1098
				1099	LoadInst *
				1100	InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
				1101	assert(!LIs.empty() && "No load instructions given.");
				1102
				1103	// All LIs are within the same BB. Select the first for a reference.
				1104	BasicBlock BB = (LIs.begin())->getParent();
				1105	BasicBlock::iterator FLI =
				1106	std::find_if(BB->begin(), BB->end(), [&LIs](Instruction &I) -> bool {
				1107	return is_contained(LIs, &I);
				1108	});
				1109	assert(FLI != BB->end());
				1110
				1111	return cast<LoadInst>(FLI);
				1112	}
				1113
				1114	bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
				1115	OptimizationRemarkEmitter &ORE) {
				1116	LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
Martin Elshuber	fef3036	2018-11-19 14:26:10 +0000	[diff] [blame]	1117
				1118	// The insertion point is the LoadInst which loads the first values. The
				1119	// following tests are used to proof that the combined load can be inserted
				1120	// just before InsertionPoint.
				1121	LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
				1122
				1123	// Test if the offset is computed
				1124	if (!InsertionPoint)
				1125	return false;
				1126
				1127	std::set<LoadInst *> LIs;
				1128	std::set<Instruction *> Is;
				1129	std::set<Instruction *> SVIs;
				1130
				1131	unsigned InterleavedCost;
				1132	unsigned InstructionCost = 0;
				1133
				1134	// Get the interleave factor
				1135	unsigned Factor = InterleavedLoad.size();
				1136
				1137	// Merge all input sets used in analysis
				1138	for (auto &VI : InterleavedLoad) {
				1139	// Generate a set of all load instructions to be combined
				1140	LIs.insert(VI.LIs.begin(), VI.LIs.end());
				1141
				1142	// Generate a set of all instructions taking part in load
				1143	// interleaved. This list excludes the instructions necessary for the
				1144	// polynomial construction.
				1145	Is.insert(VI.Is.begin(), VI.Is.end());
				1146
				1147	// Generate the set of the final ShuffleVectorInst.
				1148	SVIs.insert(VI.SVI);
				1149	}
				1150
				1151	// There is nothing to combine.
				1152	if (LIs.size() < 2)
				1153	return false;
				1154
				1155	// Test if all participating instruction will be dead after the
				1156	// transformation. If intermediate results are used, no performance gain can
				1157	// be expected. Also sum the cost of the Instructions beeing left dead.
				1158	for (auto &I : Is) {
				1159	// Compute the old cost
				1160	InstructionCost +=
				1161	TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
				1162
				1163	// The final SVIs are allowed not to be dead, all uses will be replaced
				1164	if (SVIs.find(I) != SVIs.end())
				1165	continue;
				1166
				1167	// If there are users outside the set to be eliminated, we abort the
				1168	// transformation. No gain can be expected.
				1169	for (const auto &U : I->users()) {
				1170	if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
				1171	return false;
				1172	}
				1173	}
				1174
				1175	// We know that all LoadInst are within the same BB. This guarantees that
				1176	// either everything or nothing is loaded.
				1177	LoadInst *First = findFirstLoad(LIs);
				1178
				1179	// To be safe that the loads can be combined, iterate over all loads and test
				1180	// that the corresponding defining access dominates first LI. This guarantees
				1181	// that there are no aliasing stores in between the loads.
				1182	auto FMA = MSSA.getMemoryAccess(First);
				1183	for (auto LI : LIs) {
				1184	auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
				1185	if (!MSSA.dominates(MADef, FMA))
				1186	return false;
				1187	}
				1188	assert(!LIs.empty() && "There are no LoadInst to combine");
				1189
				1190	// It is necessary that insertion point dominates all final ShuffleVectorInst.
				1191	for (auto &VI : InterleavedLoad) {
				1192	if (!DT.dominates(InsertionPoint, VI.SVI))
				1193	return false;
				1194	}
				1195
				1196	// All checks are done. Add instructions detectable by InterleavedAccessPass
				1197	// The old instruction will are left dead.
				1198	IRBuilder<> Builder(InsertionPoint);
				1199	Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
				1200	unsigned ElementsPerSVI =
				1201	InterleavedLoad.front().SVI->getType()->getNumElements();
				1202	VectorType ILTy = VectorType::get(ETy, Factor ElementsPerSVI);
				1203
				1204	SmallVector<unsigned, 4> Indices;
				1205	for (unsigned i = 0; i < Factor; i++)
				1206	Indices.push_back(i);
				1207	InterleavedCost = TTI.getInterleavedMemoryOpCost(
				1208	Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
				1209	InsertionPoint->getPointerAddressSpace());
				1210
				1211	if (InterleavedCost >= InstructionCost) {
				1212	return false;
				1213	}
				1214
				1215	// Create a pointer cast for the wide load.
				1216	auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
				1217	ILTy->getPointerTo(),
				1218	"interleaved.wide.ptrcast");
				1219
				1220	// Create the wide load and update the MemorySSA.
				1221	auto LI = Builder.CreateAlignedLoad(CI, InsertionPoint->getAlignment(),
				1222	"interleaved.wide.load");
				1223	auto MSSAU = MemorySSAUpdater(&MSSA);
				1224	MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
				1225	LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
				1226	MSSAU.insertUse(MSSALoad);
				1227
				1228	// Create the final SVIs and replace all uses.
				1229	int i = 0;
				1230	for (auto &VI : InterleavedLoad) {
				1231	SmallVector<uint32_t, 4> Mask;
				1232	for (unsigned j = 0; j < ElementsPerSVI; j++)
				1233	Mask.push_back(i + j * Factor);
				1234
				1235	Builder.SetInsertPoint(VI.SVI);
				1236	auto SVI = Builder.CreateShuffleVector(LI, UndefValue::get(LI->getType()),
				1237	Mask, "interleaved.shuffle");
				1238	VI.SVI->replaceAllUsesWith(SVI);
				1239	i++;
				1240	}
				1241
				1242	NumInterleavedLoadCombine++;
				1243	ORE.emit([&]() {
				1244	return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
				1245	<< "Load interleaved combined with factor "
				1246	<< ore::NV("Factor", Factor);
				1247	});
				1248
				1249	return true;
				1250	}
				1251
				1252	bool InterleavedLoadCombineImpl::run() {
				1253	OptimizationRemarkEmitter ORE(&F);
				1254	bool changed = false;
				1255	unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
				1256
				1257	auto &DL = F.getParent()->getDataLayout();
				1258
				1259	// Start with the highest factor to avoid combining and recombining.
				1260	for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
				1261	std::list<VectorInfo> Candidates;
				1262
				1263	for (BasicBlock &BB : F) {
				1264	for (Instruction &I : BB) {
				1265	if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
				1266
				1267	Candidates.emplace_back(SVI->getType());
				1268
				1269	if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
				1270	Candidates.pop_back();
				1271	continue;
				1272	}
				1273
				1274	if (!Candidates.back().isInterleaved(Factor, DL)) {
				1275	Candidates.pop_back();
				1276	}
				1277	}
				1278	}
				1279	}
				1280
				1281	std::list<VectorInfo> InterleavedLoad;
				1282	while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
				1283	if (combine(InterleavedLoad, ORE)) {
				1284	changed = true;
				1285	} else {
				1286	// Remove the first element of the Interleaved Load but put the others
				1287	// back on the list and continue searching
				1288	Candidates.splice(Candidates.begin(), InterleavedLoad,
				1289	std::next(InterleavedLoad.begin()),
				1290	InterleavedLoad.end());
				1291	}
				1292	InterleavedLoad.clear();
				1293	}
				1294	}
				1295
				1296	return changed;
				1297	}
				1298
				1299	namespace {
				1300	/// This pass combines interleaved loads into a pattern detectable by
				1301	/// InterleavedAccessPass.
				1302	struct InterleavedLoadCombine : public FunctionPass {
				1303	static char ID;
				1304
				1305	InterleavedLoadCombine() : FunctionPass(ID) {
				1306	initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
				1307	}
				1308
				1309	StringRef getPassName() const override {
				1310	return "Interleaved Load Combine Pass";
				1311	}
				1312
				1313	bool runOnFunction(Function &F) override {
				1314	if (DisableInterleavedLoadCombine)
				1315	return false;
				1316
				1317	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
				1318	if (!TPC)
				1319	return false;
				1320
				1321	LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
				1322	<< "\n");
				1323
				1324	return InterleavedLoadCombineImpl(
				1325	F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
				1326	getAnalysis<MemorySSAWrapperPass>().getMSSA(),
				1327	TPC->getTM<TargetMachine>())
				1328	.run();
				1329	}
				1330
				1331	void getAnalysisUsage(AnalysisUsage &AU) const override {
				1332	AU.addRequired<MemorySSAWrapperPass>();
				1333	AU.addRequired<DominatorTreeWrapperPass>();
				1334	FunctionPass::getAnalysisUsage(AU);
				1335	}
				1336
				1337	private:
				1338	};
				1339	} // anonymous namespace
				1340
				1341	char InterleavedLoadCombine::ID = 0;
				1342
				1343	INITIALIZE_PASS_BEGIN(
				1344	InterleavedLoadCombine, DEBUG_TYPE,
				1345	"Combine interleaved loads into wide loads and shufflevector instructions",
				1346	false, false)
				1347	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
				1348	INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
				1349	INITIALIZE_PASS_END(
				1350	InterleavedLoadCombine, DEBUG_TYPE,
				1351	"Combine interleaved loads into wide loads and shufflevector instructions",
				1352	false, false)
				1353
				1354	FunctionPass *
				1355	llvm::createInterleavedLoadCombinePass() {
				1356	auto P = new InterleavedLoadCombine();
				1357	return P;
				1358	}