AMDGPU: Select DS insts without m0 initialization

GFX9 stopped using m0 for most DS instructions. Select
a different instruction without the use. I think this will
be less error prone than trying to manually maintain m0
uses as needed.

llvm-svn: 319270
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 48bfc2d..840c7fb 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -112,7 +112,13 @@
   static bool offsetsCanBeCombined(CombineInfo &CI);
 
   bool findMatchingInst(CombineInfo &CI);
+
+  unsigned read2Opcode(unsigned EltSize) const;
+  unsigned read2ST64Opcode(unsigned EltSize) const;
   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+
+  unsigned write2Opcode(unsigned EltSize) const;
+  unsigned write2ST64Opcode(unsigned EltSize) const;
   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
@@ -436,6 +442,20 @@
   return false;
 }
 
+unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
+  if (STM->ldsRequiresM0Init())
+    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
+  if (STM->ldsRequiresM0Init())
+    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+
+  return (EltSize == 4) ?
+    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
+}
+
 MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
@@ -449,12 +469,8 @@
 
   unsigned NewOffset0 = CI.Offset0;
   unsigned NewOffset1 = CI.Offset1;
-  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
-                                   : AMDGPU::DS_READ2_B64;
-
-  if (CI.UseST64)
-    Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
-                            : AMDGPU::DS_READ2ST64_B64;
+  unsigned Opc = CI.UseST64 ?
+    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
 
   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
@@ -517,6 +533,20 @@
   return Next;
 }
 
+unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
+  if (STM->ldsRequiresM0Init())
+    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
+}
+
+unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
+  if (STM->ldsRequiresM0Init())
+    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+
+  return (EltSize == 4) ?
+    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
+}
+
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   CombineInfo &CI) {
   MachineBasicBlock *MBB = CI.I->getParent();
@@ -530,12 +560,8 @@
 
   unsigned NewOffset0 = CI.Offset0;
   unsigned NewOffset1 = CI.Offset1;
-  unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
-                                   : AMDGPU::DS_WRITE2_B64;
-
-  if (CI.UseST64)
-    Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
-                            : AMDGPU::DS_WRITE2ST64_B64;
+  unsigned Opc = CI.UseST64 ?
+    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
 
   if (NewOffset0 > NewOffset1) {
     // Canonicalize the merged instruction so the smaller offset comes first.
@@ -786,9 +812,13 @@
     CombineInfo CI;
     CI.I = I;
     unsigned Opc = MI.getOpcode();
-    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
+    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
+        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
+
       CI.InstClass = DS_READ_WRITE;
-      CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+      CI.EltSize =
+        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
+
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeRead2Pair(CI);
@@ -797,10 +827,13 @@
       }
 
       continue;
-    }
-    if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
+    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
+               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
+               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
       CI.InstClass = DS_READ_WRITE;
-      CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+      CI.EltSize
+        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
+
       if (findMatchingInst(CI)) {
         Modified = true;
         I = mergeWrite2Pair(CI);