All "integer" logical ops (pand, por, pxor) are now promoted to v2i64.
Clean up and fix various logical ops issues.
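
As an illustrative sketch (not part of this patch, and assuming a front end
such as llvm-gcc that lowers GCC vector extensions to generic vector code),
a v4i32 bitwise AND like the one below should now be legalized by promoting
the ISD::AND to v2i64 (with bitcasts around it), so instruction selection
only needs the single v2i64 pattern to emit pand:

  // Hypothetical example; the typedef and function name are made up.
  // The v4i32 'and' is rewritten as a v2i64 'and' during legalization,
  // which then matches the lone v2i64 pand pattern in X86InstrSSE.td.
  typedef int v4si __attribute__((vector_size(16)));
  v4si and_v4si(v4si a, v4si b) { return a & b; }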


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27633 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index f9d3643..08dcf5a 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -794,3 +794,7 @@
 X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
 to choose between movaps, movapd, and movdqa based on types of source and
 destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the packed
+elements? If not, why not always use the "ps" variants, which are likely to be
+shorter?
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b368bc4..961fa8f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -275,6 +275,9 @@
   if (Subtarget->hasSSE1()) {
     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
 
+    setOperationAction(ISD::AND,                MVT::v4f32, Legal);
+    setOperationAction(ISD::OR,                 MVT::v4f32, Legal);
+    setOperationAction(ISD::XOR,                MVT::v4f32, Legal);
     setOperationAction(ISD::ADD,                MVT::v4f32, Legal);
     setOperationAction(ISD::SUB,                MVT::v4f32, Legal);
     setOperationAction(ISD::MUL,                MVT::v4f32, Legal);
@@ -301,36 +304,43 @@
     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
     setOperationAction(ISD::MUL,                MVT::v2f64, Legal);
-    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i8, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i16, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i8, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i16, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 
-    // Promote v16i8, v8i16, v4i32 selects to v2i64. Custom lower v2i64, v2f64,
-    // and v4f32 selects.
-    for (unsigned VT = (unsigned)MVT::v16i8;
-         VT != (unsigned)MVT::v2i64; VT++) {
-      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
-      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::BUILD_VECTOR,        (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,      (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  (MVT::ValueType)VT, Custom);
+    }
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+
+    // Promote and, or, xor, load, and select on v16i8, v8i16, and v4i32 to v2i64.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::AND,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::AND,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::OR,     (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::XOR,    (MVT::ValueType)VT, MVT::v2i64);
       setOperationAction(ISD::LOAD,   (MVT::ValueType)VT, Promote);
       AddPromotedToType (ISD::LOAD,   (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
     }
+
+    // Mark v2f64 and v2i64 loads legal and custom lower their selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
-    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
   }
 
   // We want to custom lower some of our intrinsics.
@@ -2827,6 +2837,7 @@
         return SDOperand();
 
     MVT::ValueType VT = Op.getValueType();
+    // TODO: handle v16i8.
     if (MVT::getSizeInBits(VT) == 16) {
       // Transform it so it match pextrw which produces a 32-bit result.
       MVT::ValueType EVT = (MVT::ValueType)(VT+1);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ec52944..36ce4b0 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1019,9 +1019,7 @@
 let isCommutable = 1 in {
 def ANDPSrr : PSI<0x54, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "andps {$src2, $dst|$dst, $src2}",
-                  [(set VR128:$dst,
-                    (and (bc_v4i32 (v4f32 VR128:$src1)),
-                     (bc_v4i32 (v4f32 VR128:$src2))))]>;
+                  [(set VR128:$dst, (v2i64 (and VR128:$src1, VR128:$src2)))]>;
 def ANDPDrr : PDI<0x54, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                 "andpd {$src2, $dst|$dst, $src2}",
                   [(set VR128:$dst,
@@ -1029,9 +1027,7 @@
                      (bc_v2i64 (v2f64 VR128:$src2))))]>;
 def ORPSrr  : PSI<0x56, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "orps {$src2, $dst|$dst, $src2}",
-                  [(set VR128:$dst,
-                    (or (bc_v4i32 (v4f32 VR128:$src1)),
-                     (bc_v4i32 (v4f32 VR128:$src2))))]>;
+                  [(set VR128:$dst, (v2i64 (or VR128:$src1, VR128:$src2)))]>;
 def ORPDrr  : PDI<0x56, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "orpd {$src2, $dst|$dst, $src2}",
                   [(set VR128:$dst,
@@ -1039,9 +1035,7 @@
                      (bc_v2i64 (v2f64 VR128:$src2))))]>;
 def XORPSrr : PSI<0x57, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "xorps {$src2, $dst|$dst, $src2}",
-                  [(set VR128:$dst,
-                    (xor (bc_v4i32 (v4f32 VR128:$src1)),
-                     (bc_v4i32 (v4f32 VR128:$src2))))]>;
+                  [(set VR128:$dst, (v2i64 (xor VR128:$src1, VR128:$src2)))]>;
 def XORPDrr : PDI<0x57, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "xorpd {$src2, $dst|$dst, $src2}",
                   [(set VR128:$dst,
@@ -1050,9 +1044,8 @@
 }
 def ANDPSrm : PSI<0x54, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
                   "andps {$src2, $dst|$dst, $src2}",
-                [(set VR128:$dst,
-                  (and (bc_v4i32 (v4f32 VR128:$src1)),
-                   (bc_v4i32 (loadv4f32 addr:$src2))))]>;
+                  [(set VR128:$dst, (and VR128:$src1,
+                                          (bc_v2i64 (loadv4f32 addr:$src2))))]>;
 def ANDPDrm : PDI<0x54, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
                   "andpd {$src2, $dst|$dst, $src2}",
                 [(set VR128:$dst,
@@ -1060,9 +1053,8 @@
                    (bc_v2i64 (loadv2f64 addr:$src2))))]>;
 def ORPSrm  : PSI<0x56, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
                   "orps {$src2, $dst|$dst, $src2}",
-                 [(set VR128:$dst,
-                   (or (bc_v4i32 (v4f32 VR128:$src1)),
-                    (bc_v4i32 (loadv4f32 addr:$src2))))]>;
+                  [(set VR128:$dst, (or VR128:$src1,
+                                          (bc_v2i64 (loadv4f32 addr:$src2))))]>;
 def ORPDrm  : PDI<0x56, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
                 "orpd {$src2, $dst|$dst, $src2}",
                  [(set VR128:$dst,
@@ -1070,9 +1062,8 @@
                     (bc_v2i64 (loadv2f64 addr:$src2))))]>;
 def XORPSrm : PSI<0x57, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
                   "xorps {$src2, $dst|$dst, $src2}",
-                [(set VR128:$dst,
-                  (xor (bc_v4i32 (v4f32 VR128:$src1)),
-                   (bc_v4i32 (loadv4f32 addr:$src2))))]>;
+                  [(set VR128:$dst, (xor VR128:$src1,
+                                          (bc_v2i64 (loadv4f32 addr:$src2))))]>;
 def XORPDrm : PDI<0x57, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
                   "xorpd {$src2, $dst|$dst, $src2}",
                 [(set VR128:$dst,
@@ -1080,14 +1071,14 @@
                    (bc_v2i64 (loadv2f64 addr:$src2))))]>;
 def ANDNPSrr : PSI<0x55, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "andnps {$src2, $dst|$dst, $src2}",
-                [(set VR128:$dst,
-                  (and (vnot (bc_v4i32 (v4f32 VR128:$src1))),
-                   (bc_v4i32 (v4f32 VR128:$src2))))]>;
+                  [(set VR128:$dst, (v2i64 (and (xor VR128:$src1,
+                                                (bc_v2i64 (v4i32 immAllOnesV))),
+                                            VR128:$src2)))]>;
 def ANDNPSrm : PSI<0x55, MRMSrcMem, (ops VR128:$dst, VR128:$src1,f128mem:$src2),
                   "andnps {$src2, $dst|$dst, $src2}",
-                  [(set VR128:$dst,
-                    (and (vnot (bc_v4i32 (v4f32 VR128:$src1))),
-                     (bc_v4i32 (loadv4f32 addr:$src2))))]>;
+                  [(set VR128:$dst, (v2i64 (and (xor VR128:$src1,
+                                                (bc_v2i64 (v4i32 immAllOnesV))),
+                                         (bc_v2i64 (loadv4f32 addr:$src2)))))]>;
 def ANDNPDrr : PDI<0x55, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
                   "andnpd {$src2, $dst|$dst, $src2}",
                 [(set VR128:$dst,
@@ -1922,110 +1913,29 @@
 
 // 128-bit logical shifts
 def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
-          (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+          (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>,
+      Requires<[HasSSE2]>;
 def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
-          (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+          (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>,
+      Requires<[HasSSE2]>;
 
-// Logical ops
-def : Pat<(and (bc_v4i32 (v4f32 VR128:$src1)), (loadv4i32 addr:$src2)),
-          (ANDPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(and (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)),
-          (ANDPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(or  (bc_v4i32 (v4f32 VR128:$src1)), (loadv4i32 addr:$src2)),
-          (ORPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(or  (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)),
-          (ORPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(xor (bc_v4i32 (v4f32 VR128:$src1)), (loadv4i32 addr:$src2)),
-          (XORPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(xor (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)),
-          (XORPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(and (vnot (bc_v4i32 (v4f32 VR128:$src1))), (loadv4i32 addr:$src2)),
-          (ANDNPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(and (vnot (bc_v2i64 (v2f64 VR128:$src1))), (loadv2i64 addr:$src2)),
-          (ANDNPDrm VR128:$src1, addr:$src2)>;
+// Some special-case pandn patterns.
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
 
-def : Pat<(bc_v4f32 (v4i32 (and VR128:$src1, VR128:$src2))),
-          (ANDPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(bc_v4f32 (v4i32 (or VR128:$src1, VR128:$src2))),
-          (ORPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(bc_v4f32 (v4i32 (xor VR128:$src1, VR128:$src2))),
-          (XORPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(bc_v4f32 (v4i32 (and (vnot VR128:$src1), VR128:$src2))),
-          (ANDNPSrr VR128:$src1, VR128:$src2)>;
-
-def : Pat<(bc_v4f32 (v4i32 (and VR128:$src1, (load addr:$src2)))),
-          (ANDPSrm (v4i32 VR128:$src1), addr:$src2)>;
-def : Pat<(bc_v4f32 (v4i32 (or VR128:$src1, (load addr:$src2)))),
-          (ORPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(bc_v4f32 (v4i32 (xor VR128:$src1, (load addr:$src2)))),
-          (XORPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(bc_v4f32 (v4i32 (and (vnot VR128:$src1), (load addr:$src2)))),
-          (ANDNPSrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(bc_v2f64 (v2i64 (and VR128:$src1, VR128:$src2))),
-          (ANDPDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(bc_v2f64 (v2i64 (or VR128:$src1, VR128:$src2))),
-          (ORPDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(bc_v2f64 (v2i64 (xor VR128:$src1, VR128:$src2))),
-          (XORPDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(bc_v2f64 (v2i64 (and (vnot VR128:$src1), VR128:$src2))),
-          (ANDNPDrr VR128:$src1, VR128:$src2)>;
-
-def : Pat<(bc_v2f64 (v2i64 (and VR128:$src1, (load addr:$src2)))),
-          (ANDPSrm (v2i64 VR128:$src1), addr:$src2)>;
-def : Pat<(bc_v2f64 (v2i64 (or VR128:$src1, (load addr:$src2)))),
-          (ORPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(bc_v2f64 (v2i64 (xor VR128:$src1, (load addr:$src2)))),
-          (XORPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(bc_v2f64 (v2i64 (and (vnot VR128:$src1), (load addr:$src2)))),
-          (ANDNPSrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
-          (PANDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
-          (PANDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
-          (PANDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
-          (PORrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
-          (PORrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
-          (PORrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
-          (PXORrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
-          (PXORrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
-          (PXORrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (and (vnot VR128:$src1), VR128:$src2)),
-          (PANDNrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (and (vnot VR128:$src1), VR128:$src2)),
-          (PANDNrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (and (vnot VR128:$src1), VR128:$src2)),
-          (PANDNrr VR128:$src1, VR128:$src2)>;
-
-def : Pat<(v4i32 (and VR128:$src1, (load addr:$src2))),
-          (PANDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (and VR128:$src1, (load addr:$src2))),
-          (PANDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (and VR128:$src1, (load addr:$src2))),
-          (PANDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (or VR128:$src1, (load addr:$src2))),
-          (PORrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (or VR128:$src1, (load addr:$src2))),
-          (PORrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (or VR128:$src1, (load addr:$src2))),
-          (PORrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (xor VR128:$src1, (load addr:$src2))),
-          (PXORrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (xor VR128:$src1, (load addr:$src2))),
-          (PXORrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (xor VR128:$src1, (load addr:$src2))),
-          (PXORrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (and (vnot VR128:$src1), (load addr:$src2))),
-          (PANDNrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (and (vnot VR128:$src1), (load addr:$src2))),
-          (PANDNrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (and (vnot VR128:$src1), (load addr:$src2))),
-          (PANDNrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;