Drop ISD::MEMSET, ISD::MEMMOVE, and ISD::MEMCPY, which are not Legal
on any current target and aren't optimized in DAGCombiner. Instead
of using intermediate nodes, expand the operations, choosing between
simple loads/stores, target-specific code, and library calls,
immediately.
Previously, the code to emit optimized code for these operations
was only used at initial SelectionDAG construction time; now it is
used at all times. This fixes some cases where rep;movs was being
used for small copies where simple loads/stores would be better.
This also cleans up code that checks for alignments less than 4;
let the targets make that decision instead of doing it in
target-independent code. This allows x86 to use rep;movs in
low-alignment cases.
Also, this fixes a bug that resulted in the use of rep;stos for
memsets of 0 with non-constant memory size when the alignment was
at least 4. It's better to use the library in this case, which
can be significantly faster when the size is large.
This also preserves more SourceValue information when memory
intrinsics are lowered into simple loads/stores.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@49572 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/2004-02-12-Memcpy.llx b/test/CodeGen/X86/2004-02-12-Memcpy.llx
index 151c5a5..59364c1 100644
--- a/test/CodeGen/X86/2004-02-12-Memcpy.llx
+++ b/test/CodeGen/X86/2004-02-12-Memcpy.llx
@@ -1,5 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep movs | count 1
-; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep memcpy | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep movs | count 3
@A = global [32 x i32] zeroinitializer
@B = global [32 x i32] zeroinitializer
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index f438160..f85c8ff 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -1,7 +1,9 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsq | count 2
; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2
-%struct.s = type { i64, i64, i64 }
+%struct.s = type { i64, i64, i64, i64, i64, i64, i64, i64,
+ i64, i64, i64, i64, i64, i64, i64, i64,
+ i64 }
define void @g(i64 %a, i64 %b, i64 %c) {
entry:
diff --git a/test/CodeGen/X86/byval3.ll b/test/CodeGen/X86/byval3.ll
index b3794ec..074bab4 100644
--- a/test/CodeGen/X86/byval3.ll
+++ b/test/CodeGen/X86/byval3.ll
@@ -1,7 +1,11 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsl | count 2
; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2
-%struct.s = type { i32, i32, i32, i32, i32, i32 }
+%struct.s = type { i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32, i32, i32, i32, i32, i32, i32, i32,
+ i32 }
define void @g(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6) {
entry:
diff --git a/test/CodeGen/X86/byval4.ll b/test/CodeGen/X86/byval4.ll
index 591749f..d2fa9e2 100644
--- a/test/CodeGen/X86/byval4.ll
+++ b/test/CodeGen/X86/byval4.ll
@@ -1,7 +1,15 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsw | count 2
; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2
-%struct.s = type { i16, i16, i16, i16, i16, i16 }
+%struct.s = type { i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16, i16, i16, i16, i16, i16, i16, i16,
+ i16 }
define void @g(i16 signext %a1, i16 signext %a2, i16 signext %a3,
diff --git a/test/CodeGen/X86/byval5.ll b/test/CodeGen/X86/byval5.ll
index 4965d16..fd9c197 100644
--- a/test/CodeGen/X86/byval5.ll
+++ b/test/CodeGen/X86/byval5.ll
@@ -1,7 +1,23 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsb | count 2
; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2
-%struct.s = type { i8, i8, i8, i8, i8, i8 }
+%struct.s = type { i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8, i8, i8, i8, i8, i8, i8, i8,
+ i8 }
define void @g(i8 signext %a1, i8 signext %a2, i8 signext %a3,
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 4199bf0..fcbc59b 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -1,6 +1,7 @@
; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | grep add | grep 16
- %struct.S = type { <2 x i64> }
+ %struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
+ <2 x i64> }
define i32 @main() nounwind {
entry:
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
new file mode 100644
index 0000000..dedd948
--- /dev/null
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llc | not grep movs
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind {
+entry:
+ %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
+ %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
+ %tmp2 = load x86_fp80* %tmp1, align 16 ; <x86_fp80> [#uses=1]
+ %tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2 ; <x86_fp80> [#uses=1]
+ %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1 ; <x86_fp80*> [#uses=1]
+ %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0 ; <x86_fp80*> [#uses=1]
+ %tmp6 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0 ; <x86_fp80*> [#uses=1]
+ %tmp7 = load x86_fp80* %tmp6, align 16 ; <x86_fp80> [#uses=1]
+ store x86_fp80 %tmp3, x86_fp80* %real, align 16
+ store x86_fp80 %tmp7, x86_fp80* %tmp4, align 16
+ call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %iz ) nounwind
+ ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval align 4 ) nounwind
diff --git a/test/CodeGen/X86/variable-sized-darwin-bzero.ll b/test/CodeGen/X86/variable-sized-darwin-bzero.ll
new file mode 100644
index 0000000..b0cdf49
--- /dev/null
+++ b/test/CodeGen/X86/variable-sized-darwin-bzero.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-apple-darwin10 | grep __bzero
+
+declare void @llvm.memset.i64(i8*, i8, i64, i32)
+
+define void @foo(i8* %p, i64 %n) {
+ call void @llvm.memset.i64(i8* %p, i8 0, i64 %n, i32 4)
+ ret void
+}