Use PMULDQ for v2i64 multiplies when SSE4.1 is available. And add
load-folding table entries for PMULDQ and PMULLD.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@51489 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index caa81c4..2a9cef1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -700,6 +700,7 @@
   if (Subtarget->hasSSE41()) {
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Legal);
 
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 4d4867f..1a5fd47 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -569,8 +569,12 @@
     { X86::PMAXUBrr,        X86::PMAXUBrm },
     { X86::PMINSWrr,        X86::PMINSWrm },
     { X86::PMINUBrr,        X86::PMINUBrm },
+    { X86::PMULDQrr,        X86::PMULDQrm },
+    { X86::PMULDQrr_int,    X86::PMULDQrm_int },
     { X86::PMULHUWrr,       X86::PMULHUWrm },
     { X86::PMULHWrr,        X86::PMULHWrm },
+    { X86::PMULLDrr,        X86::PMULLDrm },
+    { X86::PMULLDrr_int,    X86::PMULLDrm_int },
     { X86::PMULLWrr,        X86::PMULLWrm },
     { X86::PMULUDQrr,       X86::PMULUDQrm },
     { X86::PORrr,           X86::PORrm },
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 5931635..f41f8cf 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3242,19 +3242,18 @@
                                        int_x86_sse41_pmaxud, 1>;
 defm PMAXUW       : SS41I_binop_rm_int<0x3E, "pmaxuw",
                                        int_x86_sse41_pmaxuw, 1>;
-defm PMULDQ       : SS41I_binop_rm_int<0x28, "pmuldq",
-                                       int_x86_sse41_pmuldq, 1>;
 
 
 /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
 let Constraints = "$src1 = $dst" in {
-  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                Intrinsic IntId128, bit Commutable = 0> {
+  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
+                                SDNode OpNode, Intrinsic IntId128,
+                                bit Commutable = 0> {
     def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                   [(set VR128:$dst, (OpNode (v4i32 VR128:$src1),
-                                                    VR128:$src2))]>, OpSize {
+                   [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
+                                                   VR128:$src2))]>, OpSize {
       let isCommutable = Commutable;
     }
     def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
@@ -3277,8 +3276,10 @@
                        OpSize;
   }
 }
-defm PMULLD       : SS41I_binop_patint<0x40, "pmulld", mul,
+defm PMULLD       : SS41I_binop_patint<0x40, "pmulld", v4i32, mul,
                                        int_x86_sse41_pmulld, 1>;
+defm PMULDQ       : SS41I_binop_patint<0x28, "pmuldq", v2i64, mul,
+                                       int_x86_sse41_pmuldq, 1>;
 
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate