R600: Add support for global vector stores with elements less than 32-bits

Tested-by: Aaron Watry <awatry@gmail.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188520 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 7ceab2d..78495ca 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -67,6 +67,13 @@
   setOperationAction(ISD::STORE, MVT::f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  // XXX: This can be change to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
@@ -187,6 +194,7 @@
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE: return LowerVectorStore(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   }
   return Op;
@@ -487,6 +495,59 @@
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
+SDValue AMDGPUTargetLowering::LowerVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
+
+  // Byte stores are really expensive, so if possible, try to pack
+  // 32-bit vector truncatating store into an i32 store.
+  // XXX: We could also handle optimize other vector bitwidths
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  const SDValue &Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  const SDValue &Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+  SDValue Mask;
+  switch(MemEltBits) {
+  case 8:
+    Mask = DAG.getConstant(0xFF, PackedVT);
+    break;
+  case 16:
+    Mask = DAG.getConstant(0xFFFF, PackedVT);
+    break;
+  default:
+    llvm_unreachable("Cannot lower this vector store");
+  }
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
+    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
+    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
+    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+    }
+  }
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      MachinePointerInfo(Store->getMemOperand()->getValue()),
+                      Store->isVolatile(),  Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//