Fix sdisel memcpy, memset, memmove lowering:
1. Make it possible to lower with floating point loads and stores.
2. Avoid unaligned loads / stores unless they are fast.
3. Fix a memcpy lowering logic bug related to when to optimize a
   load from a constant string into a constant.
4. Adjust x86 memcpy lowering threshold to make it more sane.
5. Fix x86 target hook so it uses vector and floating point memory
   ops more effectively.
rdar://7774704


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@100090 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
index 3ce9edb..26bf09c 100644
--- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
+++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 ; rdar://7396984
 
 @str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 0da93ba..686ed9c 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -1,10 +1,17 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | egrep {add|lea} | grep 16
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
 
 	%struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
+                           <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
                            <2 x i64> }
 
 define i32 @main() nounwind  {
 entry:
+; CHECK: main:
+; CHECK: movl $1, (%esp)
+; CHECK: leal 16(%esp), %edi
+; CHECK: movl $36, %ecx
+; CHECK: leal 160(%esp), %esi
+; CHECK: rep;movsl
 	%s = alloca %struct.S		; <%struct.S*> [#uses=2]
 	%tmp15 = getelementptr %struct.S* %s, i32 0, i32 0		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index 2dc939e..079c402 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -1,15 +1,105 @@
-; RUN: llc < %s -march=x86 -mattr=-sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7
-; RUN: llc < %s -march=x86 -mattr=+sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 5
+; RUN: llc < %s -mattr=+sse2      -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=SSE2
+; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=SSE1
+; RUN: llc < %s -mattr=-sse       -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=NOSSE
 
 	%struct.ParmT = type { [25 x i8], i8, i8* }
 @.str12 = internal constant [25 x i8] c"image\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00"		; <[25 x i8]*> [#uses=1]
 
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind 
-
-define void @t(i32 %argc, i8** %argv) nounwind  {
+define void @t1(i32 %argc, i8** %argv) nounwind  {
 entry:
+; SSE2: t1:
+; SSE2: movaps _.str12, %xmm0
+; SSE2: movaps %xmm0
+; SSE2: movb $0
+; SSE2: movl $0
+; SSE2: movl $0
+
+; SSE1: t1:
+; SSE1: movaps _.str12, %xmm0
+; SSE1: movaps %xmm0
+; SSE1: movb $0
+; SSE1: movl $0
+; SSE1: movl $0
+
+; NOSSE: t1:
+; NOSSE: movb $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $0
+; NOSSE: movl $101
+; NOSSE: movl $1734438249
 	%parms.i = alloca [13 x %struct.ParmT]		; <[13 x %struct.ParmT]*> [#uses=1]
 	%parms1.i = getelementptr [13 x %struct.ParmT]* %parms.i, i32 0, i32 0, i32 0, i32 0		; <i8*> [#uses=1]
 	call void @llvm.memcpy.i32( i8* %parms1.i, i8* getelementptr ([25 x i8]* @.str12, i32 0, i32 0), i32 25, i32 1 ) nounwind 
 	unreachable
 }
+
+;rdar://7774704
+%struct.s0 = type { [2 x double] }
+
+define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
+entry:
+; SSE2: t2:
+; SSE2: movaps (%eax), %xmm0
+; SSE2: movaps %xmm0, (%eax)
+
+; SSE1: t2:
+; SSE1: movaps (%eax), %xmm0
+; SSE1: movaps %xmm0, (%eax)
+
+; NOSSE: t2:
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+  %tmp2 = bitcast %struct.s0* %a to i8*           ; <i8*> [#uses=1]
+  %tmp3 = bitcast %struct.s0* %b to i8*           ; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 16)
+  ret void
+}
+
+define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
+entry:
+; SSE2: t3:
+; SSE2: movsd (%eax), %xmm0
+; SSE2: movsd 8(%eax), %xmm1
+; SSE2: movsd %xmm1, 8(%eax)
+; SSE2: movsd %xmm0, (%eax)
+
+; SSE1: t3:
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+; SSE1: movl
+
+; NOSSE: t3:
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+; NOSSE: movl
+  %tmp2 = bitcast %struct.s0* %a to i8*           ; <i8*> [#uses=1]
+  %tmp3 = bitcast %struct.s0* %b to i8*           ; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8)
+  ret void
+}
+
+declare void @llvm.memcpy.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index 7deb52f..e2eba76 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -1,47 +1,13 @@
-; RUN: llc < %s | not grep rep
-; RUN: llc < %s | grep memset
+; RUN: llc < %s | FileCheck %s
 
 target triple = "i386"
 
 declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
 
-define fastcc i32 @cli_scanzip(i32 %desc) nounwind {
+define fastcc void @t() nounwind {
 entry:
-	br label %bb8.i.i.i.i
-
-bb8.i.i.i.i:		; preds = %bb8.i.i.i.i, %entry
-	icmp eq i32 0, 0		; <i1>:0 [#uses=1]
-	br i1 %0, label %bb61.i.i.i, label %bb8.i.i.i.i
-
-bb32.i.i.i:		; preds = %bb61.i.i.i
-	ptrtoint i8* %tail.0.i.i.i to i32		; <i32>:1 [#uses=1]
-	sub i32 0, %1		; <i32>:2 [#uses=1]
-	icmp sgt i32 %2, 19		; <i1>:3 [#uses=1]
-	br i1 %3, label %bb34.i.i.i, label %bb61.i.i.i
-
-bb34.i.i.i:		; preds = %bb32.i.i.i
-	load i32* null, align 4		; <i32>:4 [#uses=1]
-	icmp eq i32 %4, 101010256		; <i1>:5 [#uses=1]
-	br i1 %5, label %bb8.i11.i.i.i, label %bb61.i.i.i
-
-bb8.i11.i.i.i:		; preds = %bb8.i11.i.i.i, %bb34.i.i.i
-	icmp eq i32 0, 0		; <i1>:6 [#uses=1]
-	br i1 %6, label %cli_dbgmsg.exit49.i, label %bb8.i11.i.i.i
-
-cli_dbgmsg.exit49.i:		; preds = %bb8.i11.i.i.i
-	icmp eq [32768 x i8]* null, null		; <i1>:7 [#uses=1]
-	br i1 %7, label %bb1.i28.i, label %bb8.i.i
-
-bb61.i.i.i:		; preds = %bb61.i.i.i, %bb34.i.i.i, %bb32.i.i.i, %bb8.i.i.i.i
-	%tail.0.i.i.i = getelementptr [1024 x i8]* null, i32 0, i32 0		; <i8*> [#uses=2]
-	load i8* %tail.0.i.i.i, align 1		; <i8>:8 [#uses=1]
-	icmp eq i8 %8, 80		; <i1>:9 [#uses=1]
-	br i1 %9, label %bb32.i.i.i, label %bb61.i.i.i
-
-bb1.i28.i:		; preds = %cli_dbgmsg.exit49.i
-	call void @llvm.memset.i32( i8* null, i8 0, i32 88, i32 1 ) nounwind
-	unreachable
-
-bb8.i.i:		; preds = %bb8.i.i, %cli_dbgmsg.exit49.i
-	br label %bb8.i.i
+; CHECK: t:
+; CHECK: call memset
+  call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind
+  unreachable
 }
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index da8fc51..8b817b4 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep stosl
+; RUN: llc < %s -mtriple=i386-apple-darwin | grep movl | count 20
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movq | count 10
 
 define void @bork() nounwind {
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index 9ec9182e..711dc51 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep movs
+; RUN: llc < %s | grep movsd | count 8
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index b61803d..2e9b248 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2  -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s
 
 @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8
 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8
@@ -11,7 +12,11 @@
 bb:
   %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0
   call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1)
-; CHECK: movups _.str3
+; CORE2: movsd _.str3+16
+; CORE2: movsd _.str3+8
+; CORE2: movsd _.str3
+
+; COREI7: movups _.str3
   br label %bb
 
 return:
@@ -20,8 +25,14 @@
 
 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 
-; CHECK: .align  3
-; CHECK-NEXT: _.str1:
-; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CHECK: .align 3
-; CHECK-NEXT: _.str3:
+; CORE2: .align  3
+; CORE2-NEXT: _.str1:
+; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
+; CORE2: .align 3
+; CORE2-NEXT: _.str3:
+
+; COREI7: .align  3
+; COREI7-NEXT: _.str1:
+; COREI7-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
+; COREI7: .align 3
+; COREI7-NEXT: _.str3: