Remove the pmulld intrinsic and autoupdate it as a vector multiply.

Rewrite the pmulld patterns, and make sure that they fold in loads of
arguments into the instruction.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@99910 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/pmulld.ll b/test/CodeGen/X86/pmulld.ll
new file mode 100644
index 0000000..3ef5941
--- /dev/null
+++ b/test/CodeGen/X86/pmulld.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse41 -asm-verbose=0 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK: test1:
+; CHECK-NEXT: pmulld
+  %C = mul <4 x i32> %A, %B
+  ret <4 x i32> %C
+}
+
+define <4 x i32> @test1a(<4 x i32> %A, <4 x i32> *%Bp) nounwind {
+; CHECK: test1a:
+; CHECK-NEXT: pmulld
+  %B = load <4 x i32>* %Bp
+  %C = mul <4 x i32> %A, %B
+  ret <4 x i32> %C
+}