Reapply "AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics"

This reapplies r363678, using the correct chain for the CopyToReg for
v0. glueCopyToM0 counterintuitively changes the operands of the
original node.

llvm-svn: 363870
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 0c880a3..eb1e1be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -218,7 +218,9 @@
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+  void SelectDS_GWS(SDNode *N, unsigned IntrID);
   void SelectINTRINSIC_W_CHAIN(SDNode *N);
+  void SelectINTRINSIC_VOID(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -832,6 +834,10 @@
     SelectINTRINSIC_W_CHAIN(N);
     return;
   }
+  case ISD::INTRINSIC_VOID: {
+    SelectINTRINSIC_VOID(N);
+    return;
+  }
   }
 
   SelectCode(N);
@@ -2034,6 +2040,72 @@
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
 }
 
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+  SDLoc SL(N);
+  SDValue VSrc0 = N->getOperand(2);
+  SDValue BaseOffset = N->getOperand(3);
+  int ImmOffset = 0;
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  MachineMemOperand *MMO = M->getMemOperand();
+
+  // Don't worry if the offset ends up in a VGPR. Only one lane will have
+  // effect, so SIFixSGPRCopies will validly insert readfirstlane.
+
+  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+  // offset field) % 64. Some versions of the programming guide omit the m0
+  // part, or claim it's from offset 0.
+  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+    // If we have a constant offset, try to use the default value for m0 as a
+    // base to possibly avoid setting it up.
+    glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
+    ImmOffset = ConstOffset->getZExtValue() + 1;
+  } else {
+    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+      ImmOffset = BaseOffset.getConstantOperandVal(1);
+      BaseOffset = BaseOffset.getOperand(0);
+    }
+
+    // Prefer to do the shift in an SGPR since it should be possible to use m0
+    // as the result directly. If it's already an SGPR, it will be eliminated
+    // later.
+    SDNode *SGPROffset
+      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+                               BaseOffset);
+    // Shift to offset in m0
+    SDNode *M0Base
+      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+                               SDValue(SGPROffset, 0),
+                               CurDAG->getTargetConstant(16, SL, MVT::i32));
+    glueCopyToM0(N, SDValue(M0Base, 0));
+  }
+
+  // The manual doesn't mention this, but it seems only v0 works.
+  SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+  SDValue CopyToV0 = CurDAG->getCopyToReg(
+    N->getOperand(0), SL, V0, VSrc0,
+    N->getOperand(N->getNumOperands() - 1));
+
+  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+  // TODO: Can this just be removed from the instruction?
+  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
+    AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
+
+  SDValue Ops[] = {
+    V0,
+    OffsetField,
+    GDS,
+    CopyToV0, // Chain
+    CopyToV0.getValue(1) // Glue
+  };
+
+  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   switch (IntrID) {
@@ -2044,6 +2116,18 @@
     SelectDSAppendConsume(N, IntrID);
     return;
   }
+  }
+
+  SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+    SelectDS_GWS(N, IntrID);
+    return;
   default:
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index db2dcff..248f659 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -467,11 +467,15 @@
 defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
 defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
 
-def DS_GWS_INIT       : DS_GWS_1D<"ds_gws_init">;
+let isConvergent = 1 in {
+def DS_GWS_INIT       : DS_GWS_1D<"ds_gws_init"> {
+  let mayLoad = 0;
+}
 def DS_GWS_SEMA_V     : DS_GWS_0D<"ds_gws_sema_v">;
 def DS_GWS_SEMA_BR    : DS_GWS_1D<"ds_gws_sema_br">;
 def DS_GWS_SEMA_P     : DS_GWS_0D<"ds_gws_sema_p">;
 def DS_GWS_BARRIER    : DS_GWS_1D<"ds_gws_barrier">;
+}
 
 def DS_ADD_SRC2_U32   : DS_1A<"ds_add_src2_u32">;
 def DS_SUB_SRC2_U32   : DS_1A<"ds_sub_src2_u32">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 80acf57..54cc459 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -961,6 +961,24 @@
 
     return true;
   }
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier: {
+    Info.opc = ISD::INTRINSIC_VOID;
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    Info.ptrVal =
+        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+    // This is an abstract access, but we need to specify a type and size.
+    Info.memVT = MVT::i32;
+    Info.size = 4;
+    Info.align = 4;
+
+    Info.flags = MachineMemOperand::MOStore;
+    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+      Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   default:
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 3269423..29c891c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -536,15 +536,19 @@
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+      int AddrOpIdx =
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
       // All GDS operations must protect their address register (same as
       // export.)
-      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
-          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
-            CurrScore);
+      if (AddrOpIdx != -1) {
+        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
+      } else {
+        assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
+               Inst.getOpcode() == AMDGPU::DS_CONSUME ||
+               Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+               Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
       }
+
       if (Inst.mayStore()) {
         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                        AMDGPU::OpName::data0) != -1) {
@@ -1407,18 +1411,6 @@
       ScoreBrackets.dump();
     });
 
-    // Check to see if this is a GWS instruction. If so, and if this is CI or
-    // VI, then the generated code sequence will include an S_WAITCNT 0.
-    // TODO: Are these the only GWS instructions?
-    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
-        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
-        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
-        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
-        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
-      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
-      ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
-    }
-
     // TODO: Remove this work-around after fixing the scheduler and enable the
     // assert above.
     if (VCCZBugWorkAround) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index bb0076c..56935b3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2547,7 +2547,8 @@
   //       given the typical code patterns.
   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
       Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
-      Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
+      Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
+      Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
     return true;
 
   if (MI.isCall() || MI.isInlineAsm())
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index f7d3143..b8d1ac5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -43,7 +43,8 @@
 public:
   enum AMDGPUPSVKind : unsigned {
     PSVBuffer = PseudoSourceValue::TargetCustom,
-    PSVImage
+    PSVImage,
+    GWSResource
   };
 
 protected:
@@ -87,6 +88,30 @@
   }
 };
 
+class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+  explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
+      : AMDGPUPseudoSourceValue(GWSResource, TII) {}
+
+  static bool classof(const PseudoSourceValue *V) {
+    return V->kind() == GWSResource;
+  }
+
+  // These are inaccessible memory from IR.
+  bool isAliased(const MachineFrameInfo *) const override {
+    return false;
+  }
+
+  // These are inaccessible memory from IR.
+  bool mayAlias(const MachineFrameInfo *) const override {
+    return false;
+  }
+
+  void printCustom(raw_ostream &OS) const override {
+    OS << "GWSResource";
+  }
+};
+
 namespace yaml {
 
 struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
@@ -188,6 +213,7 @@
            std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
   DenseMap<const Value *,
            std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
+  std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
 
 private:
   unsigned LDSWaveSpillSize = 0;
@@ -674,6 +700,15 @@
     return PSV.first->second.get();
   }
 
+  const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
+    if (!GWSResourcePSV) {
+      GWSResourcePSV =
+          llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+    }
+
+    return GWSResourcePSV.get();
+  }
+
   unsigned getOccupancy() const {
     return Occupancy;
   }