blob: 277b647f676b9fc153e52f6d419a5394d9a20820 [file] [log] [blame]
Tom Stellardf98f2ce2012-12-11 21:25:42 +00001//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF)
12/// to predicated instructions.
13///
14/// All control flow (except loops) is handled using predicated instructions and
15/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
16/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
17/// by writing to the 64-bit EXEC register (each bit corresponds to a
18/// single vector ALU). Typically, for predicates, a vector ALU will write
19/// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit per
20/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
21/// EXEC to update the predicates.
22///
23/// For example:
24/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
25/// SI_IF_NZ %VCC
26/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
27/// ELSE
28/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
29/// ENDIF
30///
31/// becomes:
32///
33/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask
34/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
35/// S_CBRANCH_EXECZ label0 // This instruction is an
36/// // optimization which allows us to
37/// // branch if all the bits of
38/// // EXEC are zero.
39/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
40///
41/// label0:
42/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the ELSE block
43/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask
44/// S_CBRANCH_EXECZ label1 // Use our branch optimization
45/// // instruction again.
46/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
47/// label1:
48/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits
49//===----------------------------------------------------------------------===//
50
51#include "AMDGPU.h"
52#include "SIInstrInfo.h"
53#include "SIMachineFunctionInfo.h"
54#include "llvm/CodeGen/MachineFunction.h"
55#include "llvm/CodeGen/MachineFunctionPass.h"
56#include "llvm/CodeGen/MachineInstrBuilder.h"
57#include "llvm/CodeGen/MachineRegisterInfo.h"
58
59using namespace llvm;
60
61namespace {
62
63class SILowerControlFlowPass : public MachineFunctionPass {
64
65private:
66 static char ID;
67 const TargetInstrInfo *TII;
68 std::vector<unsigned> PredicateStack;
69 std::vector<unsigned> UnusedRegisters;
70
71 unsigned allocReg();
72 void freeReg(unsigned Reg);
73
74public:
75 SILowerControlFlowPass(TargetMachine &tm) :
76 MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
77
78 virtual bool runOnMachineFunction(MachineFunction &MF);
79
80 const char *getPassName() const {
81 return "SI Lower control flow instructions";
82 }
83
84};
85
86} // End anonymous namespace
87
88char SILowerControlFlowPass::ID = 0;
89
90FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
91 return new SILowerControlFlowPass(tm);
92}
93
94bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
95
96 // Find all the unused registers that can be used for the predicate stack.
97 for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
98 S = AMDGPU::SReg_64RegClass.end();
99 I != S; ++I) {
100 unsigned Reg = *I;
101 if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
102 UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
103 }
104 }
105
106 for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
107 BB != BB_E; ++BB) {
108 MachineBasicBlock &MBB = *BB;
109 for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
110 I != MBB.end(); I = Next) {
111 Next = llvm::next(I);
112 MachineInstr &MI = *I;
113 unsigned Reg;
114 switch (MI.getOpcode()) {
115 default: break;
116 case AMDGPU::SI_IF_NZ:
117 Reg = allocReg();
118 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
119 Reg)
120 .addOperand(MI.getOperand(0)); // VCC
121 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
122 Reg)
123 .addReg(Reg)
124 .addReg(AMDGPU::EXEC);
125 MI.eraseFromParent();
126 PredicateStack.push_back(Reg);
127 break;
128
129 case AMDGPU::ELSE:
130 Reg = PredicateStack.back();
131 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
132 Reg)
133 .addReg(Reg);
134 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
135 AMDGPU::EXEC)
136 .addReg(Reg)
137 .addReg(AMDGPU::EXEC);
138 MI.eraseFromParent();
139 break;
140
141 case AMDGPU::ENDIF:
142 Reg = PredicateStack.back();
143 PredicateStack.pop_back();
144 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
145 AMDGPU::EXEC)
146 .addReg(AMDGPU::EXEC)
147 .addReg(Reg);
148 freeReg(Reg);
149
150 if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL &&
151 PredicateStack.empty()) {
152 // If the exec mask is non-zero, skip the next two instructions
153 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ))
154 .addImm(3)
155 .addReg(AMDGPU::EXEC);
156
157 // Exec mask is zero: Export to NULL target...
158 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
159 .addImm(0)
160 .addImm(0x09) // V_008DFC_SQ_EXP_NULL
161 .addImm(0)
162 .addImm(1)
163 .addImm(1)
164 .addReg(AMDGPU::SREG_LIT_0)
165 .addReg(AMDGPU::SREG_LIT_0)
166 .addReg(AMDGPU::SREG_LIT_0)
167 .addReg(AMDGPU::SREG_LIT_0);
168
169 // ... and terminate wavefront
170 BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
171 }
172 MI.eraseFromParent();
173 break;
174 }
175 }
176 }
177 return true;
178}
179
180unsigned SILowerControlFlowPass::allocReg() {
181
182 assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");
183 unsigned Reg = UnusedRegisters.back();
184 UnusedRegisters.pop_back();
185 return Reg;
186}
187
188void SILowerControlFlowPass::freeReg(unsigned Reg) {
189
190 UnusedRegisters.push_back(Reg);
191}