Reapply "AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics"

This reapplies r363678, using the correct chain for the CopyToReg for
v0. glueCopyToM0 counterintuitively changes the operands of the
original node.

llvm-svn: 363870
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 0c880a3..eb1e1be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -218,7 +218,9 @@
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+  void SelectDS_GWS(SDNode *N, unsigned IntrID);
   void SelectINTRINSIC_W_CHAIN(SDNode *N);
+  void SelectINTRINSIC_VOID(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -832,6 +834,10 @@
     SelectINTRINSIC_W_CHAIN(N);
     return;
   }
+  case ISD::INTRINSIC_VOID: {
+    SelectINTRINSIC_VOID(N);
+    return;
+  }
   }
 
   SelectCode(N);
@@ -2034,6 +2040,72 @@
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
 }
 
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+  SDLoc SL(N);
+  SDValue VSrc0 = N->getOperand(2); // Copied into v0 below.
+  SDValue BaseOffset = N->getOperand(3); // Feeds m0 and/or ImmOffset.
+  int ImmOffset = 0; // Value for the instruction's offset field.
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  MachineMemOperand *MMO = M->getMemOperand();
+  // Selects ds_gws_init / ds_gws_barrier: set up m0 and v0, emit DS_GWS_*.
+  // Don't worry if the offset ends up in a VGPR. Only one lane will have
+  // effect, so SIFixSGPRCopies will validly insert readfirstlane.
+
+  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+  // offset field) % 64. Some versions of the programming guide omit the m0
+  // part, or claim it's from offset 0.
+  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+    // If we have a constant offset, try to use the default value for m0 as a
+    // base to possibly avoid setting it up. M0[21:16] is then 63, hence +1.
+    glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
+    ImmOffset = ConstOffset->getZExtValue() + 1;
+  } else {
+    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+      ImmOffset = BaseOffset.getConstantOperandVal(1);
+      BaseOffset = BaseOffset.getOperand(0);
+    }
+
+    // Prefer to do the shift in an SGPR since it should be possible to use m0
+    // as the result directly. If it's already an SGPR, it will be eliminated
+    // later.
+    SDNode *SGPROffset
+      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+                               BaseOffset);
+    // Shift the offset up to bit 16, where m0 holds it (M0[21:16]).
+    SDNode *M0Base
+      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+                               SDValue(SGPROffset, 0),
+                               CurDAG->getTargetConstant(16, SL, MVT::i32));
+    glueCopyToM0(N, SDValue(M0Base, 0));
+  }
+
+  // The manual doesn't mention this, but it seems only v0 works.
+  SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+  SDValue CopyToV0 = CurDAG->getCopyToReg(
+    N->getOperand(0), SL, V0, VSrc0, // Chain (N was updated by glueCopyToM0).
+    N->getOperand(N->getNumOperands() - 1)); // Last operand, used as glue.
+
+  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+  // TODO: Can this just be removed from the instruction?
+  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
+    AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
+
+  SDValue Ops[] = {
+    V0,
+    OffsetField,
+    GDS,
+    CopyToV0, // Chain
+    CopyToV0.getValue(1) // Glue
+  };
+
+  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   switch (IntrID) {
@@ -2044,6 +2116,18 @@
     SelectDSAppendConsume(N, IntrID);
     return;
   }
+  }
+
+  SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+    SelectDS_GWS(N, IntrID);
+    return;
   default:
     break;
   }