[X86][AVX512DQ] Improve lowering of MUL v2i64 and v4i64

With DQI but without VLX, lower v2i64 and v4i64 MUL operations by widening the operands to v8i64, performing a v8i64 MUL (vpmullq), and extracting the low subvector of the result.

Updated the cost model accordingly by adding an AVX512DQ cost table for v2i64/v4i64/v8i64 MUL.

Differential Revision: https://reviews.llvm.org/D26011

llvm-svn: 285304
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5f1bf70..bd378cf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19854,6 +19854,25 @@
   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
          "Only know how to lower V2I64/V4I64/V8I64 multiply");
 
+  // AVX512DQ without VLX - widen the operands to a 512-bit vector.
+  // FIXME: This can possibly be converted to a tablegen pattern.
+  if (Subtarget.hasDQI()) {
+    assert(!Subtarget.hasVLX() && "AVX512DQVL vXi64 multiply is legal");
+    assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+           "AVX512DQ v8i64 multiply is legal");
+
+    MVT NewVT = MVT::getVectorVT(MVT::i64, 512 / VT.getScalarSizeInBits());
+    SDValue A512 =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), A,
+                    DAG.getIntPtrConstant(0, dl));
+    SDValue B512 =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getUNDEF(NewVT), B,
+                    DAG.getIntPtrConstant(0, dl));
+    SDValue MulNode = DAG.getNode(ISD::MUL, dl, NewVT, A512, B512);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MulNode,
+                       DAG.getIntPtrConstant(0, dl));
+  }
+
   //  Ahi = psrlqi(a, 32);
   //  Bhi = psrlqi(b, 32);
   //
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a2cc73a..f8d3a04 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -204,6 +204,19 @@
       return LT.first * Entry->Cost;
   }
 
+  static const CostTblEntry AVX512DQCostTable[] = {
+    { ISD::MUL,  MVT::v2i64, 1 },
+    { ISD::MUL,  MVT::v4i64, 1 },
+    { ISD::MUL,  MVT::v8i64, 1 }
+  };
+
+  // Look for AVX512DQ lowering tricks for custom cases.
+  if (ST->hasDQI()) {
+    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
   static const CostTblEntry AVX512BWCostTable[] = {
     // Vectorizing division is a bad idea. See the SSE2 table for more comments.
     { ISD::SDIV,  MVT::v64i8,  64*20 },