Handle equality memcmp of 8 bytes on x86-64 with two unaligned loads and a
compare.  On other targets we still end up with a call to memcmp, because we
don't want to expand this into 16 individual byte loads.  We should be able
to use movups for the 16-byte case as well, but we currently fail to select
the generated icmp.
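
For illustration, a minimal sketch (the function name is hypothetical) of the
source pattern this change targets; per the above, on x86-64 the call now
lowers to two unaligned i64 loads and a single compare instead of a libcall:

  // Hypothetical example: memcmp with a constant size of 8, used only in
  // an equality test.  After this change, x86-64 selects two unaligned
  // 64-bit loads (one per pointer) and one compare for this function.
  #include <cstring>

  bool bytes_equal8(const char *a, const char *b) {
    return std::memcmp(a, b, 8) == 0;
  }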



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@92107 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e194003..8fe7c45 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5092,17 +5092,8 @@
   return true;
 }
 
-static SDValue getMemCmpLoad(Value *PtrVal, unsigned Size,
+static SDValue getMemCmpLoad(Value *PtrVal, MVT LoadVT, const Type *LoadTy,
                              SelectionDAGBuilder &Builder) {
-  MVT LoadVT;
-  const Type *LoadTy;
-  if (Size == 2) {
-    LoadVT = MVT::i16;
-    LoadTy = Type::getInt16Ty(PtrVal->getContext());
-  } else {
-    LoadVT = MVT::i32;
-    LoadTy = Type::getInt32Ty(PtrVal->getContext()); 
-  }
   
   // Check to see if this load can be trivially constant folded, e.g. if the
   // input is from a string literal.
@@ -5158,16 +5149,61 @@
   
   // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
   // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
-  if (Size && (Size->getValue() == 2 || Size->getValue() == 4) &&
-      IsOnlyUsedInZeroEqualityComparison(&I)) {
-    SDValue LHSVal = getMemCmpLoad(LHS, Size->getZExtValue(), *this);
-    SDValue RHSVal = getMemCmpLoad(RHS, Size->getZExtValue(), *this);
+  if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+    bool ActuallyDoIt = true;
+    MVT LoadVT;
+    const Type *LoadTy;
+    switch (Size->getZExtValue()) {
+    default:
+      LoadVT = MVT::Other;
+      LoadTy = 0;
+      ActuallyDoIt = false;
+      break;
+    case 2:
+      LoadVT = MVT::i16;
+      LoadTy = Type::getInt16Ty(Size->getContext());
+      break;
+    case 4:
+      LoadVT = MVT::i32;
+      LoadTy = Type::getInt32Ty(Size->getContext()); 
+      break;
+    case 8:
+      LoadVT = MVT::i64;
+      LoadTy = Type::getInt64Ty(Size->getContext()); 
+      break;
+    /*
+    case 16:
+      LoadVT = MVT::v4i32;
+      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = VectorType::get(LoadTy, 4);
+      break;
+    */
+    }
     
-    SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal,
-                               ISD::SETNE);
-    EVT CallVT = TLI.getValueType(I.getType(), true);
-    setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT));
-    return true;
+    // This turns into unaligned loads.  We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+    
+    // Require that we can find a legal MVT, and only do this if the target
+    // supports unaligned loads of that type.  Expanding into byte loads would
+    // bloat the code.
+    if (ActuallyDoIt && Size->getZExtValue() > 4) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      if (!TLI.isTypeLegal(LoadVT) || !TLI.allowsUnalignedMemoryAccesses(LoadVT))
+        ActuallyDoIt = false;
+    }
+    
+    if (ActuallyDoIt) {
+      SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
+      SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+      
+      SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal,
+                                 ISD::SETNE);
+      EVT CallVT = TLI.getValueType(I.getType(), true);
+      setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT));
+      return true;
+    }
   }
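
For reference, a minimal C++ model (ours, not LLVM API) of the semantics the
new lowering computes for the 8-byte case, with std::memcpy standing in for
the unaligned loads the DAG emits:

  #include <cstdint>
  #include <cstring>

  // Sketch of what memcmp(a, b, 8) == 0 becomes under this transformation:
  // two unaligned 64-bit loads and one integer compare.
  static bool memcmp8_eq(const void *a, const void *b) {
    std::uint64_t lhs, rhs;
    std::memcpy(&lhs, a, sizeof lhs);  // unaligned load of LHS
    std::memcpy(&rhs, b, sizeof rhs);  // unaligned load of RHS
    return lhs == rhs;                 // single SETNE-style compare
  }

Note this only preserves memcmp's behavior in ==/!= 0 contexts, which is why
the code guards the transformation with IsOnlyUsedInZeroEqualityComparison.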