[AMDGPU] Allow hoisting of comparisons out of a loop and eliminate condition copies

Codegen prepare sinks comparisons close to a user if we have only one register
for conditions. For AMDGPU we have many SGPRs capable of holding vector
conditions. Changed BE to report we have many condition registers. That way the
IR LICM pass would hoist an invariant comparison out of a loop and codegen
prepare will not sink it.

With that done a condition is calculated in one block and used in another.
Current behavior is to store workitem's condition in a VGPR using v_cndmask_b32
and then restore it with yet another v_cmp instruction from that v_cndmask's
result. To mitigate the issue, propagation of the source SGPR pair in place of
the v_cmp is implemented. An additional side effect of this is that we may
consume fewer VGPRs at a cost of more SGPRs in case holding of multiple
conditions is needed, and that is a clear win in most cases.

Differential Revision: https://reviews.llvm.org/D26114

llvm-svn: 288053
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 9f1178c..7ed18f2 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -80,6 +80,11 @@
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
+
+  void combineMasks(MachineInstr &MI);
+
 public:
   static char ID;
 
@@ -336,6 +341,62 @@
     LIS->handleMove(*NewMI);
 }
 
+// Returns replacement operands for a logical operation: either a single
+// result for exec, or two operands if the source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+       SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
+  }
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
+
+  // Make sure we do not modify exec between def and use.
+// A copy with implicitly defined exec inserted earlier is an exception; it
+// does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands())
+    if (SrcOp.isUse() && (!SrcOp.isReg() ||
+        TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+        SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+}
+
+// Search and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
+// One of the operands is exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
+
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   TII = ST.getInstrInfo();
@@ -351,9 +412,9 @@
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock::iterator I, Next;
+    MachineBasicBlock::iterator I, Next, Last;
 
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
@@ -386,9 +447,20 @@
         emitEndCf(MI);
         break;
 
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+        // Clean up bit manipulations on the exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
       default:
-        break;
+        Last = I;
+        continue;
       }
+
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
     }
   }