Replace the "movnt" intrinsics with a native store + nontemporal metadata bit.
<rdar://problem/8460511>


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@130791 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll
index e4e2d3a..417493f 100644
--- a/test/Assembler/AutoUpgradeIntrinsics.ll
+++ b/test/Assembler/AutoUpgradeIntrinsics.ll
@@ -10,6 +10,7 @@
 ; RUN:   not grep {llvm\\.x86\\.sse2\\.loadu}
 ; RUN: llvm-as < %s | llvm-dis | \
 ; RUN:   grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 
 declare i32 @llvm.ctpop.i28(i28 %val)
 declare i32 @llvm.cttz.i29(i29 %val)
@@ -91,3 +92,20 @@
   %v2 = call <2 x double> @llvm.x86.sse2.loadu.pd(double* %b)
   ret void
 }
+
+declare void @llvm.x86.sse.movnt.ps(i8*, <4 x float>) nounwind readnone ; legacy movnt intrinsics: calls to these in @f are expected to come back from llvm-dis as nontemporal stores. NOTE(review): readnone on store-like intrinsics -- confirm intended
+declare void @llvm.x86.sse2.movnt.dq(i8*, <2 x double>) nounwind readnone ; NOTE(review): operand declared as <2 x double>, while test/CodeGen/X86/nontemporal.ll pairs movntdq with a <2 x i64> store -- confirm intended
+declare void @llvm.x86.sse2.movnt.pd(i8*, <2 x double>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.i(i8*, i32) nounwind readnone 
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D) { ; one call per legacy movnt intrinsic; each FileCheck pattern below expects a store tagged nontemporal in the llvm-as | llvm-dis output
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse.movnt.ps(i8* %B, <4 x float> %A) ; <4 x float> value through the i8* destination
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.dq(i8* %B, <2 x double> %C) ; reuses %C; type follows the <2 x double> declaration of movnt.dq
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.pd(i8* %B, <2 x double> %C) ; <2 x double> value through the i8* destination
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.i(i8* %B, i32 %D) ; scalar i32 variant
+  ret void
+}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
new file mode 100644
index 0000000..1d09535
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E) { ; four stores through %B, each tagged !nontemporal !0; the FileCheck patterns expect the matching movnt* mnemonic from llc, in this order
+; CHECK: movntps
+  %cast = bitcast i8* %B to <4 x float>* ; reinterpret %B for the <4 x float> store
+  store <4 x float> %A, <4 x float>* %cast, align 16, !nontemporal !0
+; CHECK: movntdq
+  %cast1 = bitcast i8* %B to <2 x i64>* ; reinterpret %B for the <2 x i64> store
+  store <2 x i64> %E, <2 x i64>* %cast1, align 16, !nontemporal !0
+; CHECK: movntpd
+  %cast2 = bitcast i8* %B to <2 x double>* ; reinterpret %B for the <2 x double> store
+  store <2 x double> %C, <2 x double>* %cast2, align 16, !nontemporal !0
+; CHECK: movnti
+  %cast3 = bitcast i8* %B to i32* ; reinterpret %B for the scalar i32 store
+  store i32 %D, i32* %cast3, align 16, !nontemporal !0 ; NOTE(review): align 16 on an i32 store -- confirm intended
+  ret void
+}
+
+!0 = metadata !{i32 1} ; node referenced by every !nontemporal attachment in @f; presumably the single i32 1 is the required "enabled" value -- confirm against the LangRef store documentation