Blame - llvm/lib/Target/AMDGPU/SIInsertSkips.cpp - toolchain/llvm-project

blob: ade0451ce503005bcf586f08418cf4a891dc8fcd [file] [log] [blame]

Matt Arsenault	78fc9da	2016-08-22 19:33:16 +0000	[diff] [blame]	1	//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	/// \file
				11	/// \brief This pass inserts branches on the 0 exec mask over divergent branches
				12	/// when it's expected that jumping over the untaken control flow will
				13	/// be cheaper than having every workitem no-op through it.
				14	//
				15
				16	#include "AMDGPU.h"
				17	#include "AMDGPUSubtarget.h"
				18	#include "SIInstrInfo.h"
				19	#include "SIMachineFunctionInfo.h"
				20	#include "llvm/CodeGen/MachineFrameInfo.h"
				21	#include "llvm/CodeGen/MachineFunction.h"
				22	#include "llvm/CodeGen/MachineFunctionPass.h"
				23	#include "llvm/CodeGen/MachineInstrBuilder.h"
				24	#include "llvm/MC/MCAsmInfo.h"
				25
				26	using namespace llvm;
				27
				28	#define DEBUG_TYPE "si-insert-skips"
				29
				30	namespace {
				31
				32	static cl::opt<unsigned> SkipThresholdFlag(
				33	"amdgpu-skip-threshold",
				34	cl::desc("Number of instructions before jumping over divergent control flow"),
				35	cl::init(12), cl::Hidden);
				36
				37	class SIInsertSkips : public MachineFunctionPass {
				38	private:
				39	const SIRegisterInfo *TRI;
				40	const SIInstrInfo *TII;
				41	unsigned SkipThreshold;
				42
				43	bool shouldSkip(const MachineBasicBlock &From,
				44	const MachineBasicBlock &To) const;
				45
				46	bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
				47
				48	void kill(MachineInstr &MI);
				49
				50	MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
				51	MachineBasicBlock::iterator I) const;
				52
				53	bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
				54
				55	public:
				56	static char ID;
				57
				58	SIInsertSkips() :
				59	MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
				60
				61	bool runOnMachineFunction(MachineFunction &MF) override;
				62
				63	const char *getPassName() const override {
				64	return "SI insert s_cbranch_execz instructions";
				65	}
				66
				67	void getAnalysisUsage(AnalysisUsage &AU) const override {
				68	MachineFunctionPass::getAnalysisUsage(AU);
				69	}
				70	};
				71
				72	} // End anonymous namespace
				73
				74	char SIInsertSkips::ID = 0;
				75
				76	INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
				77	"SI insert s_cbranch_execz instructions", false, false)
				78
				79	char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
				80
				81	static bool opcodeEmitsNoInsts(unsigned Opc) {
				82	switch (Opc) {
				83	case TargetOpcode::IMPLICIT_DEF:
				84	case TargetOpcode::KILL:
				85	case TargetOpcode::BUNDLE:
				86	case TargetOpcode::CFI_INSTRUCTION:
				87	case TargetOpcode::EH_LABEL:
				88	case TargetOpcode::GC_LABEL:
				89	case TargetOpcode::DBG_VALUE:
				90	return true;
				91	default:
				92	return false;
				93	}
				94	}
				95
				96	bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
				97	const MachineBasicBlock &To) const {
				98	if (From.succ_empty())
				99	return false;
				100
				101	unsigned NumInstr = 0;
				102	const MachineFunction *MF = From.getParent();
				103
				104	for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
				105	MBBI != End && MBBI != ToI; ++MBBI) {
				106	const MachineBasicBlock &MBB = *MBBI;
				107
				108	for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
				109	NumInstr < SkipThreshold && I != E; ++I) {
				110	if (opcodeEmitsNoInsts(I->getOpcode()))
				111	continue;
				112
				113	// FIXME: Since this is required for correctness, this should be inserted
				114	// during SILowerControlFlow.
				115
				116	// When a uniform loop is inside non-uniform control flow, the branch
				117	// leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
				118	// when EXEC = 0. We should skip the loop lest it becomes infinite.
				119	if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ \|\|
				120	I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
				121	return true;
				122
				123	if (I->isInlineAsm()) {
				124	const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
				125	const char *AsmStr = I->getOperand(0).getSymbolName();
				126
				127	// inlineasm length estimate is number of bytes assuming the longest
				128	// instruction.
				129	uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
				130	NumInstr += MaxAsmSize / MAI->getMaxInstLength();
				131	} else {
				132	++NumInstr;
				133	}
				134
				135	if (NumInstr >= SkipThreshold)
				136	return true;
				137	}
				138	}
				139
				140	return false;
				141	}
				142
				143	bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
				144	MachineBasicBlock &MBB = *MI.getParent();
				145	MachineFunction *MF = MBB.getParent();
				146
				147	if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS \|\|
				148	!shouldSkip(MBB, MBB.getParent()->back()))
				149	return false;
				150
				151	MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
				152
				153	const DebugLoc &DL = MI.getDebugLoc();
				154
				155	// If the exec mask is non-zero, skip the next two instructions
				156	BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
				157	.addMBB(&NextBB);
				158
				159	MachineBasicBlock::iterator Insert = SkipBB->begin();
				160
				161	// Exec mask is zero: Export to NULL target...
				162	BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
				163	.addImm(0)
				164	.addImm(0x09) // V_008DFC_SQ_EXP_NULL
				165	.addImm(0)
				166	.addImm(1)
				167	.addImm(1)
				168	.addReg(AMDGPU::VGPR0, RegState::Undef)
				169	.addReg(AMDGPU::VGPR0, RegState::Undef)
				170	.addReg(AMDGPU::VGPR0, RegState::Undef)
				171	.addReg(AMDGPU::VGPR0, RegState::Undef);
				172
				173	// ... and terminate wavefront.
				174	BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
				175
				176	return true;
				177	}
				178
				179	void SIInsertSkips::kill(MachineInstr &MI) {
				180	MachineBasicBlock &MBB = *MI.getParent();
				181	DebugLoc DL = MI.getDebugLoc();
				182	const MachineOperand &Op = MI.getOperand(0);
				183
				184	#ifndef NDEBUG
				185	CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
				186	// Kill is only allowed in pixel / geometry shaders.
				187	assert(CallConv == CallingConv::AMDGPU_PS \|\|
				188	CallConv == CallingConv::AMDGPU_GS);
				189	#endif
				190	// Clear this thread from the exec mask if the operand is negative.
				191	if (Op.isImm()) {
				192	// Constant operand: Set exec mask to 0 or do nothing
				193	if (Op.getImm() & 0x80000000) {
				194	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
				195	.addImm(0);
				196	}
				197	} else {
				198	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
				199	.addImm(0)
				200	.addOperand(Op);
				201	}
				202	}
				203
				204	MachineBasicBlock *SIInsertSkips::insertSkipBlock(
				205	MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
				206	MachineFunction *MF = MBB.getParent();
				207
				208	MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
				209	MachineFunction::iterator MBBI(MBB);
				210	++MBBI;
				211
				212	MF->insert(MBBI, SkipBB);
				213	MBB.addSuccessor(SkipBB);
				214
				215	return SkipBB;
				216	}
				217
				218	// Returns true if a branch over the block was inserted.
				219	bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
				220	MachineBasicBlock &SrcMBB) {
				221	MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
				222
				223	if (!shouldSkip(*SrcMBB.succ_begin(), DestBB))
				224	return false;
				225
				226	const DebugLoc &DL = MI.getDebugLoc();
				227	MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
				228
				229	BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
				230	.addMBB(DestBB);
				231
				232	return true;
				233	}
				234
				235	bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
				236	const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
				237	TII = ST.getInstrInfo();
				238	TRI = &TII->getRegisterInfo();
				239	SkipThreshold = SkipThresholdFlag;
				240
				241	bool HaveKill = false;
				242	bool MadeChange = false;
				243
				244	// Track depth of exec mask, divergent branches.
				245	SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
				246
				247	MachineFunction::iterator NextBB;
				248
				249	MachineBasicBlock *EmptyMBBAtEnd = nullptr;
				250
				251	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
				252	BI != BE; BI = NextBB) {
				253	NextBB = std::next(BI);
				254	MachineBasicBlock &MBB = *BI;
				255
				256	if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
				257	// Reached convergence point for last divergent branch.
				258	ExecBranchStack.pop_back();
				259	}
				260
				261	if (HaveKill && ExecBranchStack.empty()) {
				262	HaveKill = false;
				263
				264	// TODO: Insert skip if exec is 0?
				265	}
				266
				267	MachineBasicBlock::iterator I, Next;
				268	for (I = MBB.begin(); I != MBB.end(); I = Next) {
				269	Next = std::next(I);
				270
				271	MachineInstr &MI = *I;
				272
				273	switch (MI.getOpcode()) {
				274	case AMDGPU::SI_MASK_BRANCH: {
				275	ExecBranchStack.push_back(MI.getOperand(0).getMBB());
				276	MadeChange \|= skipMaskBranch(MI, MBB);
				277	break;
				278	}
				279	case AMDGPU::S_BRANCH: {
				280	// Optimize out branches to the next block.
				281	// FIXME: Shouldn't this be handled by BranchFolding?
				282	if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
				283	MI.eraseFromParent();
				284	break;
				285	}
				286	case AMDGPU::SI_KILL_TERMINATOR: {
				287	MadeChange = true;
				288	kill(MI);
				289
				290	if (ExecBranchStack.empty()) {
				291	if (skipIfDead(MI, *NextBB)) {
				292	NextBB = std::next(BI);
				293	BE = MF.end();
				294	Next = MBB.end();
				295	}
				296	} else {
				297	HaveKill = true;
				298	}
				299
				300	MI.eraseFromParent();
				301	break;
				302	}
				303	case AMDGPU::SI_RETURN: {
				304	// FIXME: Should move somewhere else
				305	assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
				306
				307	// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
				308	// because external bytecode will be appended at the end.
				309	if (BI != --MF.end() \|\| I != MBB.getFirstTerminator()) {
				310	// SI_RETURN is not the last instruction. Add an empty block at
				311	// the end and jump there.
				312	if (!EmptyMBBAtEnd) {
				313	EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
				314	MF.insert(MF.end(), EmptyMBBAtEnd);
				315	}
				316
				317	MBB.addSuccessor(EmptyMBBAtEnd);
				318	BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
				319	.addMBB(EmptyMBBAtEnd);
				320	I->eraseFromParent();
				321	}
				322	}
				323	default:
				324	break;
				325	}
				326	}
				327	}
				328
				329	return MadeChange;
				330	}