| //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF) |
| /// to predicated instructions. |
| /// |
| /// All control flow (except loops) is handled using predicated instructions and |
| /// a predicate stack. Each Scalar ALU controls the operations of 64 Vector |
| /// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs |
| /// by writing to the 64-bit EXEC register (each bit corresponds to a |
| /// single vector ALU). Typically, for predicates, a vector ALU will write |
| /// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit |
| /// for each Vector ALU) and then the Scalar ALU will AND the VCC register |
| /// with EXEC to update the predicates. |
| /// |
| /// For example: |
| /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 |
| /// SI_IF_NZ %VCC |
| /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 |
| /// ELSE |
| /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 |
| /// ENDIF |
| /// |
| /// becomes: |
| /// |
| /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask |
| /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask |
| /// S_CBRANCH_EXECZ label0 // This instruction is an |
| /// // optimization which allows us to |
| /// // branch if all the bits of |
| /// // EXEC are zero. |
| /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch |
| /// |
| /// label0: |
| /// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the ELSE block |
| /// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask |
| /// S_CBRANCH_EXECZ label1 // Use our branch optimization |
| /// // instruction again. |
| /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block |
| /// label1: |
| /// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPU.h" |
| #include "SIInstrInfo.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineFunctionPass.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| |
| using namespace llvm; |
| |
| namespace { |
| |
| class SILowerControlFlowPass : public MachineFunctionPass { |
| |
| private: |
| static char ID; |
| const TargetInstrInfo *TII; |
| std::vector<unsigned> PredicateStack; |
| std::vector<unsigned> UnusedRegisters; |
| |
| unsigned allocReg(); |
| void freeReg(unsigned Reg); |
| |
| public: |
| SILowerControlFlowPass(TargetMachine &tm) : |
| MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } |
| |
| virtual bool runOnMachineFunction(MachineFunction &MF); |
| |
| const char *getPassName() const { |
| return "SI Lower control flow instructions"; |
| } |
| |
| }; |
| |
| } // End anonymous namespace |
| |
| char SILowerControlFlowPass::ID = 0; |
| |
| FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { |
| return new SILowerControlFlowPass(tm); |
| } |
| |
| bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { |
| |
| // Find all the unused registers that can be used for the predicate stack. |
| for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), |
| S = AMDGPU::SReg_64RegClass.end(); |
| I != S; ++I) { |
| unsigned Reg = *I; |
| if (!MF.getRegInfo().isPhysRegUsed(Reg)) { |
| UnusedRegisters.insert(UnusedRegisters.begin(), Reg); |
| } |
| } |
| |
| for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); |
| BB != BB_E; ++BB) { |
| MachineBasicBlock &MBB = *BB; |
| for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); |
| I != MBB.end(); I = Next) { |
| Next = llvm::next(I); |
| MachineInstr &MI = *I; |
| unsigned Reg; |
| switch (MI.getOpcode()) { |
| default: break; |
| case AMDGPU::SI_IF_NZ: |
| Reg = allocReg(); |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), |
| Reg) |
| .addOperand(MI.getOperand(0)); // VCC |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), |
| Reg) |
| .addReg(Reg) |
| .addReg(AMDGPU::EXEC); |
| MI.eraseFromParent(); |
| PredicateStack.push_back(Reg); |
| break; |
| |
| case AMDGPU::ELSE: |
| Reg = PredicateStack.back(); |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), |
| Reg) |
| .addReg(Reg); |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), |
| AMDGPU::EXEC) |
| .addReg(Reg) |
| .addReg(AMDGPU::EXEC); |
| MI.eraseFromParent(); |
| break; |
| |
| case AMDGPU::ENDIF: |
| Reg = PredicateStack.back(); |
| PredicateStack.pop_back(); |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), |
| AMDGPU::EXEC) |
| .addReg(AMDGPU::EXEC) |
| .addReg(Reg); |
| freeReg(Reg); |
| |
| if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL && |
| PredicateStack.empty()) { |
| // If the exec mask is non-zero, skip the next two instructions |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ)) |
| .addImm(3) |
| .addReg(AMDGPU::EXEC); |
| |
| // Exec mask is zero: Export to NULL target... |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) |
| .addImm(0) |
| .addImm(0x09) // V_008DFC_SQ_EXP_NULL |
| .addImm(0) |
| .addImm(1) |
| .addImm(1) |
| .addReg(AMDGPU::SREG_LIT_0) |
| .addReg(AMDGPU::SREG_LIT_0) |
| .addReg(AMDGPU::SREG_LIT_0) |
| .addReg(AMDGPU::SREG_LIT_0); |
| |
| // ... and terminate wavefront |
| BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); |
| } |
| MI.eraseFromParent(); |
| break; |
| } |
| } |
| } |
| return true; |
| } |
| |
| unsigned SILowerControlFlowPass::allocReg() { |
| |
| assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack"); |
| unsigned Reg = UnusedRegisters.back(); |
| UnusedRegisters.pop_back(); |
| return Reg; |
| } |
| |
| void SILowerControlFlowPass::freeReg(unsigned Reg) { |
| |
| UnusedRegisters.push_back(Reg); |
| } |