AVX-512: Added shuffle instructions -
 VPSHUFD, VPERMILPS, VMOVDDUP, VMOVLHPS, VMOVHLPS, VSHUFPS, and VALIGN,
 in their single (32-bit) and double (64-bit) element forms.
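
For the 32-bit-element forms on 256/512-bit vectors, the 8-bit shuffle
immediate only describes a single 128-bit lane, so a mask is encodable
only if every lane repeats the same pattern. A minimal standalone sketch
of that check for a single-source, lane-constrained mask (illustrative
only; the in-tree checks are isSHUFPMask/isVPERMILPMask, and
isLaneRepeatedMask is a made-up name):

  #include <vector>

  static bool isLaneRepeatedMask(const std::vector<int> &Mask,
                                 unsigned NumLaneElts) {
    std::vector<int> Expected(NumLaneElts, -1); // -1 = nothing seen yet
    for (size_t i = 0, e = Mask.size(); i != e; ++i) {
      if (Mask[i] < 0)                          // undef matches anything
        continue;
      size_t LaneStart = (i / NumLaneElts) * NumLaneElts;
      int Rel = Mask[i] - (int)LaneStart;       // index within its own lane
      int &Exp = Expected[i % NumLaneElts];
      if (Exp < 0)
        Exp = Rel;                              // first definite value wins
      else if (Exp != Rel)
        return false;                           // lanes disagree
    }
    return true;
  }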


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189215 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a00f848..6a7ca7d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3600,7 +3600,7 @@
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
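+  // 512-bit VALIGND/VALIGNQ is not confined to 128-bit lanes, so treat the
+  // whole vector as a single lane.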
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   // Do not handle 64-bit element shuffles with palignr.
@@ -3683,10 +3683,7 @@
 /// specifies a shuffle of elements that is suitable for input to
 /// 128/256/512-bit SHUFPS and SHUFPD. If Commuted is true, then it checks
 /// for sources to be the reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
-                        bool Commuted = false) {
-  if (!HasFp256 && VT.is256BitVector())
-    return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3695,6 +3692,10 @@
   if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  bool SymmetricMaskRequired =
+    (VT.getSizeInBits() >= 256) && (EltSize == 32);
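+  // For 32-bit elements on 256/512-bit vectors the 8-bit immediate encodes
+  // only one 128-bit lane, so every lane must use the same pattern.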
+
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also split into 4 chunks, and each destination
   // chunk must come from a different source chunk.
@@ -3714,6 +3715,7 @@
   //
   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
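+  // MaskVal records the pattern of the first lane that has a definite
+  // index; when a symmetric mask is required, later lanes must repeat it.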
+  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   unsigned HalfLaneElems = NumLaneElems/2;
   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
     for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3724,9 +3726,13 @@
       // For VSHUFPSY, the mask of the second half must be the same as the
       // first but with the appropriate offsets. This works in the same way as
       // VPERMILPS works with masks.
-      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+      if (!SymmetricMaskRequired || Idx < 0)
         continue;
-      if (!isUndefOrEqual(Idx, Mask[i]+l))
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Idx - l;
+        continue;
+      }
+      if ((signed)(Idx - l) != MaskVal[i])
         return false;
     }
   }
@@ -4158,31 +4164,32 @@
 /// to the same elements of the low, but to the higher half of the source.
 /// In VPERMILPD the two lanes could be shuffled independently of each other
 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
-  if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
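+  // This matcher only handles 256/512-bit vectors with 32/64-bit elements.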
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (VT.getSizeInBits() < 256 || EltSize < 32)
     return false;
-
+  bool SymmetricMaskRequired = (EltSize == 32);
   unsigned NumElts = VT.getVectorNumElements();
-  // Only match 256-bit with 32/64-bit types
-  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
-    return false;
 
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned LaneSize = NumElts/NumLanes;
+  // Each 128-bit lane holds 2 (64-bit) or 4 (32-bit) elements.
+
+  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
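+  // With 32-bit elements the mask must repeat per lane: remember the first
+  // lane's pattern and require every later lane to match it.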
   for (unsigned l = 0; l != NumElts; l += LaneSize) {
     for (unsigned i = 0; i != LaneSize; ++i) {
       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
         return false;
-      if (NumElts != 8 || l == 0)
-        continue;
-      // VPERMILPS handling
-      if (Mask[i] < 0)
-        continue;
-      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
-        return false;
+      if (SymmetricMaskRequired) {
+        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+          ExpectedMaskVal[i] = Mask[i+l] - l;
+          continue;
+        }
+        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+          return false;
+      }
     }
   }
-
   return true;
 }
 
@@ -4431,10 +4438,11 @@
 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   MVT VT = SVOp->getSimpleValueType(0);
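+  // For 512-bit VALIGND/VALIGNQ the immediate counts elements rather than
+  // bytes, and the whole vector is treated as a single lane.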
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+  unsigned EltSize = VT.is512BitVector() ? 1 :
+    VT.getVectorElementType().getSizeInBits() >> 3;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   int Val = 0;
@@ -7407,7 +7415,7 @@
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
@@ -7430,7 +7438,7 @@
                                 getShufflePSHUFLWImmediate(SVOp),
                                 DAG);
 
-  if (isSHUFPMask(M, VT, HasFp256))
+  if (isSHUFPMask(M, VT))
     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                 getShuffleSHUFImmediate(SVOp), DAG);
 
@@ -7449,8 +7457,8 @@
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
 
   // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasFp256)) {
-    if (HasInt256 && VT == MVT::v8i32)
+  if (isVPERMILPMask(M, VT)) {
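+    // Integer types select PSHUFD so the shuffle stays in the integer domain.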
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                   getShuffleSHUFImmediate(SVOp), DAG);
     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -13621,7 +13629,7 @@
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, SVT) ||
-          isSHUFPMask(M, SVT, Subtarget->hasFp256()) ||
+          isSHUFPMask(M, SVT) ||
           isPSHUFDMask(M, SVT) ||
           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -13646,8 +13654,8 @@
   if (NumElts == 4 && SVT.is128BitVector()) {
     return (isMOVLMask(Mask, SVT)  ||
             isCommutedMOVLMask(Mask, SVT, true) ||
-            isSHUFPMask(Mask, SVT, Subtarget->hasFp256()) ||
-            isSHUFPMask(Mask, SVT, Subtarget->hasFp256(), /* Commuted */ true));
+            isSHUFPMask(Mask, SVT) ||
+            isSHUFPMask(Mask, SVT, /* Commuted */ true));
   }
   return false;
 }
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 17be5df..cf4a0f5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1621,6 +1621,45 @@
 defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
                                 VR512, memopv8i64, i512mem>, EVEX_V512,
                                 VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+                      i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+                      memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+                      EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+                      memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+                      VEX_W, EVEX_CD8<64, CD8VF>;
+
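+// Match the integer element types onto the FP VPERMILPS/PD instructions.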
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPDZri VR512:$src1, imm:$imm)>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512  Logical Instructions
@@ -1774,8 +1813,8 @@
 defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512,  f512mem,
                               memopv16i32, X86testm, v16i32>, EVEX_V512,
                               EVEX_CD8<32, CD8VF>;
-defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512,  f512mem, memopv8i64,
-                              X86testm, v8i64>, EVEX_V512, VEX_W,
+defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512,  f512mem,
+                              memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
                               EVEX_CD8<64, CD8VF>;
 
 //===----------------------------------------------------------------------===//
@@ -1914,3 +1953,99 @@
 defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, 
                                i512mem, memopv8i64>, EVEX_V512, VEX_W,
                                EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, 
+                        X86MemOperand x86memop, PatFrag memop_frag> {
+def rr  : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm  : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst,
+                      (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
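+// A MOVDDUP of a scalar f64 load selects the memory form directly.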
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+          (VMOVDDUPZrm addr:$src)>;
+
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+           IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+          IIC_SSE_MOV_LH>, EVEX_4V;
+
+// MOVLHPS patterns
+def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+          (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+          (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+// MOVHLPS patterns
+def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+          (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+                      ValueType vt, string OpcodeStr, PatFrag mem_frag,
+                      Domain d> {
+  def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, TB, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, TB, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+                  SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+                  SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, RC:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+                     (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, 
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, 
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
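+// Note the swapped operands: X86PAlignr and VALIGND/Q disagree on which
+// source supplies the low elements.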
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4eaba38..9b27e27 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1327,7 +1327,7 @@
 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
 
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseAVX] in {
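+// Use these VEX forms only when the EVEX-encoded AVX-512 variants are not
+// available.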
   def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1358,7 +1358,7 @@
                         IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
   // MOVLHPS patterns
   def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
             (VMOVLHPSrr VR128:$src1, VR128:$src2)>;