Remove the pmulld intrinsic and autoupdate it as a vector multiply.
Rewrite the pmulld patterns, and make sure that they fold in loads of
arguments into the instruction.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@99910 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/Bitcode/sse41_pmulld.ll b/test/Bitcode/sse41_pmulld.ll
new file mode 100644
index 0000000..caf8547
--- /dev/null
+++ b/test/Bitcode/sse41_pmulld.ll
@@ -0,0 +1,2 @@
+; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.pmulld}
+; RUN: llvm-dis < %s.bc | grep mul
\ No newline at end of file
diff --git a/test/Bitcode/sse41_pmulld.ll.bc b/test/Bitcode/sse41_pmulld.ll.bc
new file mode 100644
index 0000000..bd66f0a
--- /dev/null
+++ b/test/Bitcode/sse41_pmulld.ll.bc
Binary files differ
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index e2746a8..bf5229a 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 > %t
; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 12
+; RUN: grep mov %t | count 11
define <4 x i32> @a(<4 x i32> %i) nounwind {
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
diff --git a/test/CodeGen/X86/pmulld.ll b/test/CodeGen/X86/pmulld.ll
new file mode 100644
index 0000000..3ef5941
--- /dev/null
+++ b/test/CodeGen/X86/pmulld.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse41 -asm-verbose=0 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK: test1:
+; CHECK-NEXT: pmulld
+ %C = mul <4 x i32> %A, %B
+ ret <4 x i32> %C
+}
+
+define <4 x i32> @test1a(<4 x i32> %A, <4 x i32> *%Bp) nounwind {
+; CHECK: test1a:
+; CHECK-NEXT: pmulld
+ %B = load <4 x i32>* %Bp
+ %C = mul <4 x i32> %A, %B
+ ret <4 x i32> %C
+}