Add x86 isel logic and patterns to match movlps from clang generated IR for _mm_loadl_pi(). rdar://10134392, rdar://10050222

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144052 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b15dfac..aab7c73 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6190,6 +6190,10 @@
     V = V.getOperand(0);
   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
     V = V.getOperand(0);
+  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+    // BUILD_VECTOR (load), undef
+    V = V.getOperand(0);
   if (MayFoldLoad(V))
     return true;
   return false;
@@ -6372,15 +6376,10 @@
   //    turns into:
   //  (MOVLPSmr addr:$src1, VR128:$src2)
   // So, recognize this potential and also use MOVLPS or MOVLPD
-  if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
     CanFoldLoad = true;
 
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-
-  // Both of them can't be memory operations though.
-  if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
-    CanFoldLoad = false;
-
   if (CanFoldLoad) {
     if (HasXMMInt && NumElems == 2)
       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index db4382a..4b6ba5d 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1035,6 +1035,9 @@
   }
 
   // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
+                                 (iPTR 0))), addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
   def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
             (MOVLPSmr addr:$src1, VR128:$src2)>;
   def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
@@ -1049,6 +1052,9 @@
   def : Pat<(X86Movlps VR128:$src1,
                       (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
             (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlps VR128:$src1,
+                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
 
   // Store patterns
   def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),