Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 1 | //=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This file contains a pass that performs optimization for vector by element |
| 11 | // SIMD instructions. |
| 12 | // |
| 13 | // Certain SIMD instructions with vector element operand are not efficient. |
| 14 | // Rewrite them into SIMD instructions with vector operands. This rewrite |
| 15 | // is driven by the latency of the instructions. |
| 16 | // |
| 17 | // Example: |
| 18 | // fmla v0.4s, v1.4s, v2.s[1] |
| 19 | // is rewritten into |
| 20 | // dup v3.4s, v2.s[1] |
| 21 | // fmla v0.4s, v1.4s, v3.4s |
Eugene Zelenko | 11f6907 | 2017-01-25 00:29:26 +0000 | [diff] [blame^] | 22 | // |
Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 23 | //===----------------------------------------------------------------------===// |
| 24 | |
| 25 | #include "AArch64InstrInfo.h" |
Eugene Zelenko | 11f6907 | 2017-01-25 00:29:26 +0000 | [diff] [blame^] | 26 | #include "llvm/ADT/SmallVector.h" |
Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 27 | #include "llvm/ADT/Statistic.h" |
Eugene Zelenko | 11f6907 | 2017-01-25 00:29:26 +0000 | [diff] [blame^] | 28 | #include "llvm/ADT/StringRef.h" |
| 29 | #include "llvm/CodeGen/MachineBasicBlock.h" |
| 30 | #include "llvm/CodeGen/MachineFunction.h" |
| 31 | #include "llvm/CodeGen/MachineFunctionPass.h" |
| 32 | #include "llvm/CodeGen/MachineInstr.h" |
Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 33 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
Eugene Zelenko | 11f6907 | 2017-01-25 00:29:26 +0000 | [diff] [blame^] | 34 | #include "llvm/CodeGen/MachineOperand.h" |
Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 35 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
| 36 | #include "llvm/CodeGen/TargetSchedule.h" |
Eugene Zelenko | 11f6907 | 2017-01-25 00:29:26 +0000 | [diff] [blame^] | 37 | #include "llvm/MC/MCInstrDesc.h" |
| 38 | #include "llvm/MC/MCSchedule.h" |
| 39 | #include "llvm/Pass.h" |
| 40 | #include "llvm/Target/TargetInstrInfo.h" |
| 41 | #include "llvm/Target/TargetSubtargetInfo.h" |
| 42 | #include <map> |
Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 43 | |
using namespace llvm;

#define DEBUG_TYPE "aarch64-vectorbyelement-opt"

// Counter reported under -stats: number of indexed SIMD instructions this
// pass rewrote into DUP + vector-operand form.
STATISTIC(NumModifiedInstr,
          "Number of vector by element instructions modified");

// Human-readable pass name, shared by getPassName() and INITIALIZE_PASS.
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 vector by element instruction optimization pass"
| 53 | |
namespace {

/// Machine-function pass that rewrites certain SIMD instructions with a
/// vector-element operand (e.g. fmla v0.4s, v1.4s, v2.s[1]) into a DUP of
/// the lane followed by the plain vector-operand form, whenever the
/// subtarget's scheduling model reports the DUP + vector pair as having
/// lower combined latency than the indexed form.
struct AArch64VectorByElementOpt : public MachineFunctionPass {
  static char ID; // Pass identification.

  const TargetInstrInfo *TII;  // Initialized in runOnMachineFunction.
  MachineRegisterInfo *MRI;    // Initialized in runOnMachineFunction.
  TargetSchedModel SchedModel; // Latency source driving the rewrite decision.

  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
  }

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the two instructions InstDescRep1
  /// and InstDescRep2. Decisions are cached in VecInstElemTable, keyed by
  /// the opcode of InstDesc.
  /// Return true if replacement is recommended.
  bool
  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
                           const MCInstrDesc *InstDescRep1,
                           const MCInstrDesc *InstDescRep2,
                           std::map<unsigned, bool> &VecInstElemTable) const;

  /// Determine if we need to exit the vector by element instruction
  /// optimization pass early. This makes sure that Targets with no need
  /// for this optimization do not spend any compile time on this pass.
  /// This check is done by comparing the latency of an indexed FMLA
  /// instruction to the latency of the DUP + the latency of a vector
  /// FMLA instruction. We do not check on other related instructions such
  /// as FMLS as we assume that if the situation shows up for one
  /// instruction, then it is likely to show up for the related ones.
  /// Return true if early exit of the pass is recommended.
  bool earlyExitVectElement(MachineFunction *MF);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not (by scanning backwards within the same basic block).
  /// Return true when the dup instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI,
                           std::map<unsigned, bool> *VecInstElemTable) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64VectorByElementOpt::ID = 0;

} // end anonymous namespace
Sebastian Pop | eb65d72 | 2016-10-08 12:30:07 +0000 | [diff] [blame] | 112 | |
// Register the pass with LLVM's PassRegistry under the command-line name
// "aarch64-vectorbyelement-opt" (not CFG-only, not an analysis pass).
INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
| 115 | |
| 116 | /// Based only on latency of instructions, determine if it is cost efficient |
| 117 | /// to replace the instruction InstDesc by the two instructions InstDescRep1 |
| 118 | /// and InstDescRep2. Note that it is assumed in this fuction that an |
| 119 | /// instruction of type InstDesc is always replaced by the same two |
| 120 | /// instructions as results are cached here. |
| 121 | /// Return true if replacement is recommended. |
| 122 | bool AArch64VectorByElementOpt::shouldReplaceInstruction( |
| 123 | MachineFunction *MF, const MCInstrDesc *InstDesc, |
| 124 | const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2, |
| 125 | std::map<unsigned, bool> &VecInstElemTable) const { |
| 126 | // Check if replacment decision is alredy available in the cached table. |
| 127 | // if so, return it. |
| 128 | if (!VecInstElemTable.empty() && |
| 129 | VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end()) |
| 130 | return VecInstElemTable[InstDesc->getOpcode()]; |
| 131 | |
| 132 | unsigned SCIdx = InstDesc->getSchedClass(); |
| 133 | unsigned SCIdxRep1 = InstDescRep1->getSchedClass(); |
| 134 | unsigned SCIdxRep2 = InstDescRep2->getSchedClass(); |
| 135 | const MCSchedClassDesc *SCDesc = |
| 136 | SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); |
| 137 | const MCSchedClassDesc *SCDescRep1 = |
| 138 | SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1); |
| 139 | const MCSchedClassDesc *SCDescRep2 = |
| 140 | SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2); |
| 141 | |
| 142 | // If a subtarget does not define resources for any of the instructions |
| 143 | // of interest, then return false for no replacement. |
| 144 | if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() || |
| 145 | SCDescRep1->isVariant() || !SCDescRep2->isValid() || |
| 146 | SCDescRep2->isVariant()) { |
| 147 | VecInstElemTable[InstDesc->getOpcode()] = false; |
| 148 | return false; |
| 149 | } |
| 150 | |
| 151 | if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > |
| 152 | SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) + |
| 153 | SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) { |
| 154 | VecInstElemTable[InstDesc->getOpcode()] = true; |
| 155 | return true; |
| 156 | } |
| 157 | VecInstElemTable[InstDesc->getOpcode()] = false; |
| 158 | return false; |
| 159 | } |
| 160 | |
| 161 | /// Determine if we need to exit the vector by element instruction |
| 162 | /// optimization pass early. This makes sure that Targets with no need |
| 163 | /// for this optimization do not spent any compile time on this pass. |
| 164 | /// This check is done by comparing the latency of an indexed FMLA |
| 165 | /// instruction to the latency of the DUP + the latency of a vector |
| 166 | /// FMLA instruction. We do not check on other related instructions such |
| 167 | /// as FMLS as we assume that if the situation shows up for one |
| 168 | /// instruction, then it is likely to show up for the related ones. |
| 169 | /// Return true if early exit of the pass is recommended. |
| 170 | bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) { |
| 171 | std::map<unsigned, bool> VecInstElemTable; |
| 172 | const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed); |
| 173 | const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane); |
| 174 | const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32); |
| 175 | |
| 176 | if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID, |
| 177 | VecInstElemTable)) |
| 178 | return true; |
| 179 | return false; |
| 180 | } |
| 181 | |
| 182 | /// Check whether an equivalent DUP instruction has already been |
| 183 | /// created or not. |
| 184 | /// Return true when the dup instruction already exists. In this case, |
| 185 | /// DestReg will point to the destination of the already created DUP. |
| 186 | bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, |
| 187 | unsigned SrcReg, unsigned LaneNumber, |
| 188 | unsigned *DestReg) const { |
| 189 | for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); |
| 190 | MII != MIE;) { |
| 191 | MII--; |
| 192 | MachineInstr *CurrentMI = &*MII; |
| 193 | |
| 194 | if (CurrentMI->getOpcode() == DupOpcode && |
| 195 | CurrentMI->getNumOperands() == 3 && |
| 196 | CurrentMI->getOperand(1).getReg() == SrcReg && |
| 197 | CurrentMI->getOperand(2).getImm() == LaneNumber) { |
| 198 | *DestReg = CurrentMI->getOperand(0).getReg(); |
| 199 | return true; |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | return false; |
| 204 | } |
| 205 | |
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are for the time being fmla, fmls, fmul,
/// and fmulx and hence they are hardcoded.
///
/// Example:
///  fmla v0.4s, v1.4s, v2.s[1]
///  is rewritten into
///  dup v3.4s, v2.s[1]           // dup not necessary if redundant
///  fmla v0.4s, v1.4s, v3.4s
/// Return true if the SIMD instruction is modified.
bool AArch64VectorByElementOpt::optimizeVectElement(
    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
  const MCInstrDesc *MulMCID, *DupMCID;
  // Default register class covers the 128-bit (4x32 and 2x64) forms; the
  // 2x32 cases below narrow it to FPR64.
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  // Map each supported indexed opcode to its lane-DUP opcode and the
  // vector-operand replacement opcode. Any other opcode is left untouched.
  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  // Only rewrite when the scheduling model says the pair is cheaper.
  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
                                *VecInstElemTable))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // get the operands of the current SIMD arithmetic instruction.
  unsigned MulDest = MI.getOperand(0).getReg();
  unsigned SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  unsigned SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands. The 5-operand
  // shape is presumably the accumulating fmla/fmls form (dest, accumulator,
  // multiplicand, lane source, lane index) and the 4-operand shape
  // fmul/fmulx (dest, multiplicand, lane source, lane index) — inferred
  // from which operands are read below; confirm against the .td patterns.
  if (MI.getNumOperands() == 5) {
    unsigned SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();

    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    // NOTE(review): when the DUP is reused, Src2IsKill (taken from MI's lane
    // operand) is applied to the reused DupDest register — confirm this
    // cannot mark a register as killed while it is still used later.
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    // Unexpected operand count: leave the instruction alone.
    return false;
  }

  // The caller erases MI; count the rewrite for -stats.
  ++NumModifiedInstr;
  return true;
}
| 340 | |
| 341 | bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) { |
| 342 | if (skipFunction(*MF.getFunction())) |
| 343 | return false; |
| 344 | |
| 345 | TII = MF.getSubtarget().getInstrInfo(); |
| 346 | MRI = &MF.getRegInfo(); |
| 347 | const TargetSubtargetInfo &ST = MF.getSubtarget(); |
| 348 | const AArch64InstrInfo *AAII = |
| 349 | static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); |
| 350 | if (!AAII) |
| 351 | return false; |
| 352 | SchedModel.init(ST.getSchedModel(), &ST, AAII); |
| 353 | if (!SchedModel.hasInstrSchedModel()) |
| 354 | return false; |
| 355 | |
| 356 | // A simple check to exit this pass early for targets that do not need it. |
| 357 | if (earlyExitVectElement(&MF)) |
| 358 | return false; |
| 359 | |
| 360 | bool Changed = false; |
| 361 | std::map<unsigned, bool> VecInstElemTable; |
| 362 | SmallVector<MachineInstr *, 8> RemoveMIs; |
| 363 | |
| 364 | for (MachineBasicBlock &MBB : MF) { |
| 365 | for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); |
| 366 | MII != MIE;) { |
| 367 | MachineInstr &MI = *MII; |
| 368 | if (optimizeVectElement(MI, &VecInstElemTable)) { |
| 369 | // Add MI to the list of instructions to be removed given that it has |
| 370 | // been replaced. |
| 371 | RemoveMIs.push_back(&MI); |
| 372 | Changed = true; |
| 373 | } |
| 374 | ++MII; |
| 375 | } |
| 376 | } |
| 377 | |
| 378 | for (MachineInstr *MI : RemoveMIs) |
| 379 | MI->eraseFromParent(); |
| 380 | |
| 381 | return Changed; |
| 382 | } |
| 383 | |
| 384 | /// createAArch64VectorByElementOptPass - returns an instance of the |
| 385 | /// vector by element optimization pass. |
| 386 | FunctionPass *llvm::createAArch64VectorByElementOptPass() { |
| 387 | return new AArch64VectorByElementOpt(); |
| 388 | } |