Blame - llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp - toolchain/llvm-project

blob: 3ff4155849a69ff48cd010f7cad9f9b316727604 [file] [log] [blame]

Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	1	//
				2	// The LLVM Compiler Infrastructure
				3	//
				4	// This file is distributed under the University of Illinois Open Source
				5	// License. See LICENSE.TXT for details.
				6	//
				7	//===----------------------------------------------------------------------===//
				8	//
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	9	// This file contains a pass that performs optimization on SIMD instructions
				10	// with high latency by splitting them into more efficient series of
				11	// instructions.
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	12	//
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	13	// 1. Rewrite certain SIMD instructions with vector element due to their
				14	// inefficiency on some targets.
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	15	// Example:
				16	// fmla v0.4s, v1.4s, v2.s[1]
				17	// is rewritten into
				18	// dup v3.4s, v2.s[1]
				19	// fmla v0.4s, v1.4s, v3.4s
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	20	//
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	21	// 2. Rewrite Interleaved memory access instructions due to their
				22	// inefficiency on some targets.
				23	// Example:
				24	// st2 {v0.4s, v1.4s}, addr
				25	// is rewritten into
				26	// zip1 v2.4s, v0.4s, v1.4s
				27	// zip2 v3.4s, v0.4s, v1.4s
				28	// stp q2, q3, addr
				29	//
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	30	//===----------------------------------------------------------------------===//
				31
				32	#include "AArch64InstrInfo.h"
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	33	#include "llvm/ADT/SmallVector.h"
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	34	#include "llvm/ADT/Statistic.h"
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	35	#include "llvm/ADT/StringRef.h"
				36	#include "llvm/CodeGen/MachineBasicBlock.h"
				37	#include "llvm/CodeGen/MachineFunction.h"
				38	#include "llvm/CodeGen/MachineFunctionPass.h"
				39	#include "llvm/CodeGen/MachineInstr.h"
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	40	#include "llvm/CodeGen/MachineInstrBuilder.h"
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	41	#include "llvm/CodeGen/MachineOperand.h"
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	42	#include "llvm/CodeGen/MachineRegisterInfo.h"
David Blaikie	3f833ed	2017-11-08 01:01:31 +0000	[diff] [blame]	43	#include "llvm/CodeGen/TargetInstrInfo.h"
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	44	#include "llvm/CodeGen/TargetSchedule.h"
David Blaikie	b3bde2e	2017-11-17 01:07:10 +0000	[diff] [blame]	45	#include "llvm/CodeGen/TargetSubtargetInfo.h"
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	46	#include "llvm/MC/MCInstrDesc.h"
				47	#include "llvm/MC/MCSchedule.h"
				48	#include "llvm/Pass.h"
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	49	#include <unordered_map>
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	50
				51	using namespace llvm;
				52
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	53	#define DEBUG_TYPE "aarch64-simdinstr-opt"
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	54
				55	STATISTIC(NumModifiedInstr,
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	56	"Number of SIMD instructions modified");
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	57
				58	#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	59	"AArch64 SIMD instructions optimization pass"
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	60
				61	namespace {
				62
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	63	struct AArch64SIMDInstrOpt : public MachineFunctionPass {
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	64	static char ID;
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	65
				66	const TargetInstrInfo *TII;
				67	MachineRegisterInfo *MRI;
				68	TargetSchedModel SchedModel;
				69
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	70	// The two maps below are used to cache decisions instead of recomputing:
				71	// This is used to cache instruction replacement decisions within function
				72	// units and across function units.
				73	std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
				74	// This is used to cache the decision of whether to leave the Interleave-Store
				75	// instructions replacement pass early or not for a particular target.
				76	std::unordered_map<std::string, bool> InterlEarlyExit;
				77
				78	typedef enum {
				79	VectorElem,
				80	Interleave
				81	} Subpass;
				82
				83	// Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
				84	struct InstReplInfo {
				85	unsigned OrigOpc;
				86	std::vector<unsigned> ReplOpc;
				87	const TargetRegisterClass RC;
				88	};
				89
				90	#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
				91	{OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
				92	#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
				93	OpcR7, OpcR8, OpcR9, RC) \
				94	{OpcOrg, {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, \
				95	OpcR8, OpcR9}, RC}
				96
				97	// The Instruction Replacement Table:
				98	std::vector<InstReplInfo> IRT = {
				99	// ST2 instructions
				100	RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
				101	AArch64::STPQi, AArch64::FPR128RegClass),
				102	RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
				103	AArch64::STPQi, AArch64::FPR128RegClass),
				104	RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
				105	AArch64::STPDi, AArch64::FPR64RegClass),
				106	RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
				107	AArch64::STPQi, AArch64::FPR128RegClass),
				108	RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
				109	AArch64::STPDi, AArch64::FPR64RegClass),
				110	RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
				111	AArch64::STPQi, AArch64::FPR128RegClass),
				112	RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
				113	AArch64::STPDi, AArch64::FPR64RegClass),
				114	// ST4 instructions
				115	RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
				116	AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
				117	AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
				118	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
				119	RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
				120	AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
				121	AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
				122	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
				123	RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
				124	AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
				125	AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
				126	AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
				127	RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
				128	AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
				129	AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
				130	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
				131	RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
				132	AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
				133	AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
				134	AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
				135	RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
				136	AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
				137	AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
				138	AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
				139	RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
				140	AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
				141	AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
				142	AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
				143	};
				144
				145	// A costly instruction is replaced in this work by N efficient instructions
				146	// The maximum of N is curently 10 and it is for ST4 case.
				147	static const unsigned MaxNumRepl = 10;
				148
				149	AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
				150	initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	151	}
				152
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	153	/// Based only on latency of instructions, determine if it is cost efficient
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	154	/// to replace the instruction InstDesc by the instructions stored in the
				155	/// array InstDescRepl.
				156	/// Return true if replacement is expected to be faster.
				157	bool shouldReplaceInst(MachineFunction MF, const MCInstrDesc InstDesc,
				158	SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	159
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	160	/// Determine if we need to exit the instruction replacement optimization
				161	/// subpasses early. This makes sure that Targets with no need for this
				162	/// optimization do not spend any compile time on this subpass other than the
				163	/// simple check performed here. This simple check is done by comparing the
				164	/// latency of the original instruction to the latency of the replacement
				165	/// instructions. We only check for a representative instruction in the class
				166	/// of instructions and not all concerned instructions. For the VectorElem
				167	/// subpass, we check for the FMLA instruction while for the interleave subpass
				168	/// we check for the st2.4s instruction.
				169	/// Return true if early exit of the subpass is recommended.
				170	bool shouldExitEarly(MachineFunction *MF, Subpass SP);
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	171
				172	/// Check whether an equivalent DUP instruction has already been
				173	/// created or not.
				174	/// Return true when the dup instruction already exists. In this case,
				175	/// DestReg will point to the destination of the already created DUP.
				176	bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
				177	unsigned LaneNumber, unsigned *DestReg) const;
				178
				179	/// Certain SIMD instructions with vector element operand are not efficient.
				180	/// Rewrite them into SIMD instructions with vector operands. This rewrite
				181	/// is driven by the latency of the instructions.
				182	/// Return true if the SIMD instruction is modified.
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	183	bool optimizeVectElement(MachineInstr &MI);
				184
				185	/// Process The REG_SEQUENCE instruction, and extract the source
				186	/// operands of the st2/4 instruction from it.
				187	/// Example of such instructions.
				188	/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
				189	/// Return true when the instruction is processed successfully.
				190	bool processSeqRegInst(MachineInstr DefiningMI, unsigned StReg,
				191	unsigned* StRegKill, unsigned NumArg) const;
				192
				193	/// Load/Store Interleaving instructions are not always beneficial.
				194	/// Replace them by zip instructionand classical load/store.
				195	/// Return true if the SIMD instruction is modified.
				196	bool optimizeLdStInterleave(MachineInstr &MI);
				197
				198	/// Return the number of useful source registers for this
				199	/// instruction (2 for st2 and 4 for st4).
				200	unsigned determineSrcReg(MachineInstr &MI) const;
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	201
				202	bool runOnMachineFunction(MachineFunction &Fn) override;
				203
				204	StringRef getPassName() const override {
				205	return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
				206	}
				207	};
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	208
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	209	char AArch64SIMDInstrOpt::ID = 0;
Eugene Zelenko	11f6907	2017-01-25 00:29:26 +0000	[diff] [blame]	210
				211	} // end anonymous namespace
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	212
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	213	INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	214	AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
				215
				216	/// Based only on latency of instructions, determine if it is cost efficient
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	217	/// to replace the instruction InstDesc by the instructions stored in the
				218	/// array InstDescRepl.
				219	/// Return true if replacement is expected to be faster.
				220	bool AArch64SIMDInstrOpt::
				221	shouldReplaceInst(MachineFunction MF, const MCInstrDesc InstDesc,
				222	SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
				223	// Check if replacement decision is already available in the cached table.
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	224	// if so, return it.
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	225	std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
				226	std::pair<unsigned, std::string> InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
				227	if (!SIMDInstrTable.empty() &&
				228	SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
				229	return SIMDInstrTable[InstID];
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	230
				231	unsigned SCIdx = InstDesc->getSchedClass();
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	232	const MCSchedClassDesc *SCDesc =
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	233	SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	234
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	235	// If a subtarget does not define resources for the instructions
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	236	// of interest, then return false for no replacement.
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	237	const MCSchedClassDesc *SCDescRepl;
				238	if (!SCDesc->isValid() \|\| SCDesc->isVariant())
				239	{
				240	SIMDInstrTable[InstID] = false;
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	241	return false;
				242	}
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	243	for (auto IDesc : InstDescRepl)
				244	{
				245	SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
				246	IDesc->getSchedClass());
				247	if (!SCDescRepl->isValid() \|\| SCDescRepl->isVariant())
				248	{
				249	SIMDInstrTable[InstID] = false;
				250	return false;
				251	}
				252	}
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	253
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	254	// Replacement cost.
				255	unsigned ReplCost = 0;
				256	for (auto IDesc :InstDescRepl)
				257	ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
				258
				259	if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
				260	{
				261	SIMDInstrTable[InstID] = true;
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	262	return true;
				263	}
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	264	else
				265	{
				266	SIMDInstrTable[InstID] = false;
				267	return false;
				268	}
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	269	}
				270
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	271	/// Determine if we need to exit the instruction replacement optimization
				272	/// subpasses early. This makes sure that Targets with no need for this
				273	/// optimization do not spend any compile time on this subpass other than the
				274	/// simple check performed here. This simple check is done by comparing the
				275	/// latency of the original instruction to the latency of the replacement
				276	/// instructions. We only check for a representative instruction in the class of
				277	/// instructions and not all concerned instructions. For the VectorElem subpass,
				278	/// we check for the FMLA instruction while for the interleave subpass we check
				279	/// for the st2.4s instruction.
				280	/// Return true if early exit of the subpass is recommended.
				281	bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
				282	const MCInstrDesc* OriginalMCID;
				283	SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	284
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	285	switch (SP) {
				286	case VectorElem:
				287	OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
				288	ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
				289	ReplInstrMCID.push_back(&TII->get(AArch64::FMULv4f32));
				290	if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
				291	return false;
				292	break;
				293	case Interleave:
				294	// Check if early exit decision is already available in the cached
				295	// table or not.
				296	std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
				297	if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
				298	return InterlEarlyExit[Subtarget];
				299
				300	for (auto &I : IRT) {
				301	OriginalMCID = &TII->get(I.OrigOpc);
				302	for (auto &Repl : I.ReplOpc)
				303	ReplInstrMCID.push_back(&TII->get(Repl));
				304	if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
				305	InterlEarlyExit[Subtarget] = false;
				306	return false;
				307	}
				308	ReplInstrMCID.clear();
				309	}
				310	InterlEarlyExit[Subtarget] = true;
				311	break;
				312	}
				313
				314	return true;
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	315	}
				316
				317	/// Check whether an equivalent DUP instruction has already been
				318	/// created or not.
				319	/// Return true when the dup instruction already exists. In this case,
				320	/// DestReg will point to the destination of the already created DUP.
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	321	bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	322	unsigned SrcReg, unsigned LaneNumber,
				323	unsigned *DestReg) const {
				324	for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
				325	MII != MIE;) {
				326	MII--;
				327	MachineInstr CurrentMI = &MII;
				328
				329	if (CurrentMI->getOpcode() == DupOpcode &&
				330	CurrentMI->getNumOperands() == 3 &&
				331	CurrentMI->getOperand(1).getReg() == SrcReg &&
				332	CurrentMI->getOperand(2).getImm() == LaneNumber) {
				333	*DestReg = CurrentMI->getOperand(0).getReg();
				334	return true;
				335	}
				336	}
				337
				338	return false;
				339	}
				340
				341	/// Certain SIMD instructions with vector element operand are not efficient.
				342	/// Rewrite them into SIMD instructions with vector operands. This rewrite
				343	/// is driven by the latency of the instructions.
				344	/// The instruction of concerns are for the time being fmla, fmls, fmul,
				345	/// and fmulx and hence they are hardcoded.
				346	///
				347	/// Example:
				348	/// fmla v0.4s, v1.4s, v2.s[1]
				349	/// is rewritten into
				350	/// dup v3.4s, v2.s[1] // dup not necessary if redundant
				351	/// fmla v0.4s, v1.4s, v3.4s
				352	/// Return true if the SIMD instruction is modified.
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	353	bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	354	const MCInstrDesc MulMCID, DupMCID;
				355	const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
				356
				357	switch (MI.getOpcode()) {
				358	default:
				359	return false;
				360
				361	// 4X32 instructions
				362	case AArch64::FMLAv4i32_indexed:
				363	DupMCID = &TII->get(AArch64::DUPv4i32lane);
				364	MulMCID = &TII->get(AArch64::FMLAv4f32);
				365	break;
				366	case AArch64::FMLSv4i32_indexed:
				367	DupMCID = &TII->get(AArch64::DUPv4i32lane);
				368	MulMCID = &TII->get(AArch64::FMLSv4f32);
				369	break;
				370	case AArch64::FMULXv4i32_indexed:
				371	DupMCID = &TII->get(AArch64::DUPv4i32lane);
				372	MulMCID = &TII->get(AArch64::FMULXv4f32);
				373	break;
				374	case AArch64::FMULv4i32_indexed:
				375	DupMCID = &TII->get(AArch64::DUPv4i32lane);
				376	MulMCID = &TII->get(AArch64::FMULv4f32);
				377	break;
				378
				379	// 2X64 instructions
				380	case AArch64::FMLAv2i64_indexed:
				381	DupMCID = &TII->get(AArch64::DUPv2i64lane);
				382	MulMCID = &TII->get(AArch64::FMLAv2f64);
				383	break;
				384	case AArch64::FMLSv2i64_indexed:
				385	DupMCID = &TII->get(AArch64::DUPv2i64lane);
				386	MulMCID = &TII->get(AArch64::FMLSv2f64);
				387	break;
				388	case AArch64::FMULXv2i64_indexed:
				389	DupMCID = &TII->get(AArch64::DUPv2i64lane);
				390	MulMCID = &TII->get(AArch64::FMULXv2f64);
				391	break;
				392	case AArch64::FMULv2i64_indexed:
				393	DupMCID = &TII->get(AArch64::DUPv2i64lane);
				394	MulMCID = &TII->get(AArch64::FMULv2f64);
				395	break;
				396
				397	// 2X32 instructions
				398	case AArch64::FMLAv2i32_indexed:
				399	RC = &AArch64::FPR64RegClass;
				400	DupMCID = &TII->get(AArch64::DUPv2i32lane);
				401	MulMCID = &TII->get(AArch64::FMLAv2f32);
				402	break;
				403	case AArch64::FMLSv2i32_indexed:
				404	RC = &AArch64::FPR64RegClass;
				405	DupMCID = &TII->get(AArch64::DUPv2i32lane);
				406	MulMCID = &TII->get(AArch64::FMLSv2f32);
				407	break;
				408	case AArch64::FMULXv2i32_indexed:
				409	RC = &AArch64::FPR64RegClass;
				410	DupMCID = &TII->get(AArch64::DUPv2i32lane);
				411	MulMCID = &TII->get(AArch64::FMULXv2f32);
				412	break;
				413	case AArch64::FMULv2i32_indexed:
				414	RC = &AArch64::FPR64RegClass;
				415	DupMCID = &TII->get(AArch64::DUPv2i32lane);
				416	MulMCID = &TII->get(AArch64::FMULv2f32);
				417	break;
				418	}
				419
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	420	SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
				421	ReplInstrMCID.push_back(DupMCID);
				422	ReplInstrMCID.push_back(MulMCID);
				423	if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
				424	ReplInstrMCID))
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	425	return false;
				426
				427	const DebugLoc &DL = MI.getDebugLoc();
				428	MachineBasicBlock &MBB = *MI.getParent();
				429	MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
				430
				431	// get the operands of the current SIMD arithmetic instruction.
				432	unsigned MulDest = MI.getOperand(0).getReg();
				433	unsigned SrcReg0 = MI.getOperand(1).getReg();
				434	unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
				435	unsigned SrcReg1 = MI.getOperand(2).getReg();
				436	unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
				437	unsigned DupDest;
				438
				439	// Instructions of interest have either 4 or 5 operands.
				440	if (MI.getNumOperands() == 5) {
				441	unsigned SrcReg2 = MI.getOperand(3).getReg();
				442	unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
				443	unsigned LaneNumber = MI.getOperand(4).getImm();
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	444	// Create a new DUP instruction. Note that if an equivalent DUP instruction
				445	// has already been created before, then use that one instread of creating
				446	// a new one.
				447	if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
				448	DupDest = MRI.createVirtualRegister(RC);
				449	BuildMI(MBB, MI, DL, *DupMCID, DupDest)
				450	.addReg(SrcReg2, Src2IsKill)
				451	.addImm(LaneNumber);
				452	}
				453	BuildMI(MBB, MI, DL, *MulMCID, MulDest)
				454	.addReg(SrcReg0, Src0IsKill)
				455	.addReg(SrcReg1, Src1IsKill)
				456	.addReg(DupDest, Src2IsKill);
				457	} else if (MI.getNumOperands() == 4) {
				458	unsigned LaneNumber = MI.getOperand(3).getImm();
				459	if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
				460	DupDest = MRI.createVirtualRegister(RC);
				461	BuildMI(MBB, MI, DL, *DupMCID, DupDest)
				462	.addReg(SrcReg1, Src1IsKill)
				463	.addImm(LaneNumber);
				464	}
				465	BuildMI(MBB, MI, DL, *MulMCID, MulDest)
				466	.addReg(SrcReg0, Src0IsKill)
				467	.addReg(DupDest, Src1IsKill);
				468	} else {
				469	return false;
				470	}
				471
				472	++NumModifiedInstr;
				473	return true;
				474	}
				475
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	476	/// Load/Store Interleaving instructions are not always beneficial.
				477	/// Replace them by zip instructions and classical load/store.
				478	///
				479	/// Example:
				480	/// st2 {v0.4s, v1.4s}, addr
				481	/// is rewritten into
				482	/// zip1 v2.4s, v0.4s, v1.4s
				483	/// zip2 v3.4s, v0.4s, v1.4s
				484	/// stp q2, q3, addr
				485	//
				486	/// Example:
				487	/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
				488	/// is rewritten into
				489	/// zip1 v4.4s, v0.4s, v2.4s
				490	/// zip2 v5.4s, v0.4s, v2.4s
				491	/// zip1 v6.4s, v1.4s, v3.4s
				492	/// zip2 v7.4s, v1.4s, v3.4s
				493	/// zip1 v8.4s, v4.4s, v6.4s
				494	/// zip2 v9.4s, v4.4s, v6.4s
				495	/// zip1 v10.4s, v5.4s, v7.4s
				496	/// zip2 v11.4s, v5.4s, v7.4s
				497	/// stp q8, q9, addr
				498	/// stp q10, q11, addr+32
				499	/// Currently only instructions related to st2 and st4 are considered.
				500	/// Other may be added later.
				501	/// Return true if the SIMD instruction is modified.
				502	bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
				503
				504	unsigned SeqReg, AddrReg;
				505	unsigned StReg[4], StRegKill[4];
				506	MachineInstr *DefiningMI;
				507	const DebugLoc &DL = MI.getDebugLoc();
				508	MachineBasicBlock &MBB = *MI.getParent();
				509	SmallVector<unsigned, MaxNumRepl> ZipDest;
				510	SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
				511
				512	// If current instruction matches any of the rewriting rules, then
				513	// gather information about parameters of the new instructions.
				514	bool Match = false;
				515	for (auto &I : IRT) {
				516	if (MI.getOpcode() == I.OrigOpc) {
				517	SeqReg = MI.getOperand(0).getReg();
				518	AddrReg = MI.getOperand(1).getReg();
				519	DefiningMI = MRI->getUniqueVRegDef(SeqReg);
				520	unsigned NumReg = determineSrcReg(MI);
				521	if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
				522	return false;
				523
				524	for (auto &Repl : I.ReplOpc) {
				525	ReplInstrMCID.push_back(&TII->get(Repl));
				526	// Generate destination registers but only for non-store instruction.
				527	if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
				528	ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
				529	}
				530	Match = true;
				531	break;
				532	}
				533	}
				534
				535	if (!Match)
				536	return false;
				537
				538	// Determine if it is profitable to replace MI by the series of instructions
				539	// represented in ReplInstrMCID.
				540	if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
				541	ReplInstrMCID))
				542	return false;
				543
				544	// Generate the replacement instructions composed of zip1, zip2, and stp (at
				545	// this point, the code generation is hardcoded and does not rely on the IRT
				546	// table used above given that code generation for ST2 replacement is somewhat
				547	// different than for ST4 replacement. We could have added more info into the
				548	// table related to how we build new instructions but we may be adding more
				549	// complexity with that).
				550	switch (MI.getOpcode()) {
				551	default:
				552	return false;
				553	case AArch64::ST2Twov16b:
				554	case AArch64::ST2Twov8b:
				555	case AArch64::ST2Twov8h:
				556	case AArch64::ST2Twov4h:
				557	case AArch64::ST2Twov4s:
				558	case AArch64::ST2Twov2s:
				559	case AArch64::ST2Twov2d:
				560	// zip instructions
				561	BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
				562	.addReg(StReg[0])
				563	.addReg(StReg[1]);
				564	BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
				565	.addReg(StReg[0], StRegKill[0])
				566	.addReg(StReg[1], StRegKill[1]);
				567	// stp instructions
				568	BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
				569	.addReg(ZipDest[0])
				570	.addReg(ZipDest[1])
				571	.addReg(AddrReg)
				572	.addImm(0);
				573	break;
				574	case AArch64::ST4Fourv16b:
				575	case AArch64::ST4Fourv8b:
				576	case AArch64::ST4Fourv8h:
				577	case AArch64::ST4Fourv4h:
				578	case AArch64::ST4Fourv4s:
				579	case AArch64::ST4Fourv2s:
				580	case AArch64::ST4Fourv2d:
				581	// zip instructions
				582	BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
				583	.addReg(StReg[0])
				584	.addReg(StReg[2]);
				585	BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
				586	.addReg(StReg[0], StRegKill[0])
				587	.addReg(StReg[2], StRegKill[2]);
				588	BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
				589	.addReg(StReg[1])
				590	.addReg(StReg[3]);
				591	BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
				592	.addReg(StReg[1], StRegKill[1])
				593	.addReg(StReg[3], StRegKill[3]);
				594	BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
				595	.addReg(ZipDest[0])
				596	.addReg(ZipDest[2]);
				597	BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
				598	.addReg(ZipDest[0])
				599	.addReg(ZipDest[2]);
				600	BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
				601	.addReg(ZipDest[1])
				602	.addReg(ZipDest[3]);
				603	BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
				604	.addReg(ZipDest[1])
				605	.addReg(ZipDest[3]);
				606	// stp instructions
				607	BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
				608	.addReg(ZipDest[4])
				609	.addReg(ZipDest[5])
				610	.addReg(AddrReg)
				611	.addImm(0);
				612	BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
				613	.addReg(ZipDest[6])
				614	.addReg(ZipDest[7])
				615	.addReg(AddrReg)
				616	.addImm(2);
				617	break;
				618	}
				619
				620	++NumModifiedInstr;
				621	return true;
				622	}
				623
				624	/// Process The REG_SEQUENCE instruction, and extract the source
				625	/// operands of the st2/4 instruction from it.
				626	/// Example of such instruction.
				627	/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
				628	/// Return true when the instruction is processed successfully.
				629	bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
				630	unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
				631	assert (DefiningMI != NULL);
				632	if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
				633	return false;
				634
				635	for (unsigned i=0; i<NumArg; i++) {
				636	StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
				637	StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
				638
				639	// Sanity check for the other arguments.
				640	if (DefiningMI->getOperand(2*i+2).isImm()) {
				641	switch (DefiningMI->getOperand(2*i+2).getImm()) {
				642	default:
				643	return false;
				644	case AArch64::dsub0:
				645	case AArch64::dsub1:
				646	case AArch64::dsub2:
				647	case AArch64::dsub3:
				648	case AArch64::qsub0:
				649	case AArch64::qsub1:
				650	case AArch64::qsub2:
				651	case AArch64::qsub3:
				652	break;
				653	}
				654	}
				655	else
				656	return false;
				657	}
				658	return true;
				659	}
				660
				661	/// Return the number of useful source registers for this instruction
				662	/// (2 for ST2 and 4 for ST4).
				663	unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
				664	switch (MI.getOpcode()) {
				665	default:
				666	llvm_unreachable("Unsupported instruction for this pass");
				667	case AArch64::ST2Twov16b:
				668	case AArch64::ST2Twov8b:
				669	case AArch64::ST2Twov8h:
				670	case AArch64::ST2Twov4h:
				671	case AArch64::ST2Twov4s:
				672	case AArch64::ST2Twov2s:
				673	case AArch64::ST2Twov2d:
				674	return 2;
				675	case AArch64::ST4Fourv16b:
				676	case AArch64::ST4Fourv8b:
				677	case AArch64::ST4Fourv8h:
				678	case AArch64::ST4Fourv4h:
				679	case AArch64::ST4Fourv4s:
				680	case AArch64::ST4Fourv2s:
				681	case AArch64::ST4Fourv2d:
				682	return 4;
				683	}
				684	}
				685
				686	bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	687	if (skipFunction(*MF.getFunction()))
				688	return false;
				689
				690	TII = MF.getSubtarget().getInstrInfo();
				691	MRI = &MF.getRegInfo();
				692	const TargetSubtargetInfo &ST = MF.getSubtarget();
				693	const AArch64InstrInfo *AAII =
				694	static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
				695	if (!AAII)
				696	return false;
				697	SchedModel.init(ST.getSchedModel(), &ST, AAII);
				698	if (!SchedModel.hasInstrSchedModel())
				699	return false;
				700
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	701	bool Changed = false;
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	702	for (auto OptimizationKind : {VectorElem, Interleave}) {
				703	if (!shouldExitEarly(&MF, OptimizationKind)) {
				704	SmallVector<MachineInstr *, 8> RemoveMIs;
				705	for (MachineBasicBlock &MBB : MF) {
				706	for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
				707	MII != MIE;) {
				708	MachineInstr &MI = *MII;
				709	bool InstRewrite;
				710	if (OptimizationKind == VectorElem)
				711	InstRewrite = optimizeVectElement(MI) ;
				712	else
				713	InstRewrite = optimizeLdStInterleave(MI);
				714	if (InstRewrite) {
				715	// Add MI to the list of instructions to be removed given that it
				716	// has been replaced.
				717	RemoveMIs.push_back(&MI);
				718	Changed = true;
				719	}
				720	++MII;
				721	}
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	722	}
Abderrazek Zaafrani	2c80e4c	2017-12-08 00:58:49 +0000	[diff] [blame^]	723	for (MachineInstr *MI : RemoveMIs)
				724	MI->eraseFromParent();
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	725	}
				726	}
				727
Sebastian Pop	eb65d72	2016-10-08 12:30:07 +0000	[diff] [blame]	728	return Changed;
				729	}
				730
/// createAArch64SIMDInstrOptPass - returns a newly allocated instance of the
/// AArch64 SIMD instructions optimization pass (AArch64SIMDInstrOpt).
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}