[X86] Remove some composite MOVSS/MOVSD isel patterns.

These patterns matched a MOVSS/SD fed by a scalar_to_vector, or a scalar_to_vector fed by a load.

In both cases we emitted a MOVSS/SD for the MOVSS/SD part, a COPY_TO_REGCLASS for the scalar_to_vector, and a MOVSS/SD load for the load.

But we already have patterns that do each of those three things individually, so there is no reason to build these large composite patterns.
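
For illustration, the removed patterns had roughly this shape (a hypothetical TableGen sketch of the kind of composite pattern described, not the literal patterns deleted by this commit):

  // Matches the whole MOVSS(reg, scalar_to_vector(load)) subtree in one
  // Pat, yet still emits the same three instructions that the individual
  // patterns would produce on their own.
  def : Pat<(v4f32 (X86Movss VR128:$src1,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src2))))),
            (MOVSSrr VR128:$src1,
                     (COPY_TO_REGCLASS (MOVSSrm addr:$src2), VR128))>;

The standalone patterns already map loadf32 to MOVSSrm, scalar_to_vector to a COPY_TO_REGCLASS, and X86Movss to MOVSSrr, so isel reaches the same output by matching each node separately.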

Most of the test changes are just reorderings. The one test with a meaningful change is pr30430.ll, and it appears to be a regression. But that test runs at -O0, so I think it missed a lot of opportunities and was just getting lucky before.

llvm-svn: 336762
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index aba8b13..f4c2065 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -39,6 +39,8 @@
 ;
 ; AVX-64-LABEL: test_buildvector_v16f32:
 ; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; AVX-64-NEXT:    vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
@@ -46,12 +48,10 @@
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; AVX-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm9[0],mem[0],xmm9[2,3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; AVX-64-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm8[0],mem[0],xmm8[2,3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index e176986..eff8319 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -107,11 +107,11 @@
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_buildvector_v4f32_load:
@@ -135,9 +135,9 @@
 define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) {
 ; SSE2-LABEL: test_buildvector_v4f32_partial_load:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/gather-addresses.ll b/llvm/test/CodeGen/X86/gather-addresses.ll
index 1e9dd96..ca8fd2a 100644
--- a/llvm/test/CodeGen/X86/gather-addresses.ll
+++ b/llvm/test/CodeGen/X86/gather-addresses.ll
@@ -26,8 +26,8 @@
 ; LIN-SSE2-NEXT:    movslq %edx, %rdx
 ; LIN-SSE2-NEXT:    movslq %esi, %rsi
 ; LIN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; LIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; LIN-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; LIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; LIN-SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; LIN-SSE2-NEXT:    retq
 ;
@@ -42,10 +42,10 @@
 ; LIN-SSE4-NEXT:    cltq
 ; LIN-SSE4-NEXT:    movslq %ecx, %rcx
 ; LIN-SSE4-NEXT:    movslq %edx, %rdx
+; LIN-SSE4-NEXT:    movslq %esi, %rsi
 ; LIN-SSE4-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; LIN-SSE4-NEXT:    movslq %esi, %rax
 ; LIN-SSE4-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; LIN-SSE4-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; LIN-SSE4-NEXT:    retq
 ;
@@ -60,13 +60,13 @@
 ; WIN-SSE2-NEXT:    movd %xmm1, %r10d
 ; WIN-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; WIN-SSE2-NEXT:    movd %xmm0, %edx
-; WIN-SSE2-NEXT:    movslq %r8d, %rax
+; WIN-SSE2-NEXT:    movslq %r8d, %r11
 ; WIN-SSE2-NEXT:    movslq %r9d, %r8
-; WIN-SSE2-NEXT:    movslq %r10d, %r9
+; WIN-SSE2-NEXT:    movslq %r10d, %rax
 ; WIN-SSE2-NEXT:    movslq %edx, %rdx
 ; WIN-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; WIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; WIN-SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; WIN-SSE2-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; WIN-SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; WIN-SSE2-NEXT:    retq
 ;
@@ -79,12 +79,12 @@
 ; WIN-SSE4-NEXT:    pextrd $2, %xmm0, %r8d
 ; WIN-SSE4-NEXT:    pextrd $3, %xmm0, %r9d
 ; WIN-SSE4-NEXT:    cltq
-; WIN-SSE4-NEXT:    movslq %edx, %rdx
-; WIN-SSE4-NEXT:    movslq %r8d, %r8
+; WIN-SSE4-NEXT:    movslq %edx, %r10
+; WIN-SSE4-NEXT:    movslq %r8d, %rdx
+; WIN-SSE4-NEXT:    movslq %r9d, %r8
 ; WIN-SSE4-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; WIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; WIN-SSE4-NEXT:    movslq %r9d, %rax
 ; WIN-SSE4-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; WIN-SSE4-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; WIN-SSE4-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; WIN-SSE4-NEXT:    retq
 ;
@@ -97,13 +97,13 @@
 ; LIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; LIN32-NEXT:    movdqa (%edx), %xmm0
 ; LIN32-NEXT:    pand (%ecx), %xmm0
-; LIN32-NEXT:    pextrd $1, %xmm0, %ecx
-; LIN32-NEXT:    pextrd $2, %xmm0, %edx
-; LIN32-NEXT:    pextrd $3, %xmm0, %esi
-; LIN32-NEXT:    movd %xmm0, %edi
+; LIN32-NEXT:    movd %xmm0, %ecx
+; LIN32-NEXT:    pextrd $1, %xmm0, %edx
+; LIN32-NEXT:    pextrd $2, %xmm0, %esi
+; LIN32-NEXT:    pextrd $3, %xmm0, %edi
 ; LIN32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; LIN32-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; LIN32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; LIN32-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; LIN32-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; LIN32-NEXT:    popl %esi
 ; LIN32-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 7770b99..3cbfe03 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -431,33 +431,33 @@
 ; CHECK-I686-NEXT:    pushl %esi
 ; CHECK-I686-NEXT:    subl $56, %esp
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-I686-NEXT:    movzwl 2(%esi), %eax
-; CHECK-I686-NEXT:    movl %eax, (%esp)
-; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
-; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
 ; CHECK-I686-NEXT:    movzwl 4(%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
-; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
-; CHECK-I686-NEXT:    movzwl 6(%esi), %eax
+; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    movzwl 2(%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; CHECK-I686-NEXT:    movzwl (%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
+; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
+; CHECK-I686-NEXT:    movzwl 6(%esi), %eax
+; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-I686-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-I686-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-I686-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-I686-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; CHECK-I686-NEXT:    addl $56, %esp
 ; CHECK-I686-NEXT:    popl %esi
 ; CHECK-I686-NEXT:    retl
@@ -474,24 +474,24 @@
 ; CHECK-LIBCALL-NEXT:    movq %rdi, %rbx
 ; CHECK-LIBCALL-NEXT:    movzwl 4(%rdi), %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-LIBCALL-NEXT:    movzwl 6(%rbx), %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-LIBCALL-NEXT:    movzwl (%rbx), %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-LIBCALL-NEXT:    movzwl 2(%rbx), %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
 ; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm1
-; CHECK-LIBCALL-NEXT:    movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
+; CHECK-LIBCALL-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-LIBCALL-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm0, %xmm0
 ; CHECK-LIBCALL-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-LIBCALL-NEXT:    movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-LIBCALL-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm1, %xmm2
-; CHECK-LIBCALL-NEXT:    movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
+; CHECK-LIBCALL-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; CHECK-LIBCALL-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; CHECK-LIBCALL-NEXT:    cvtss2sd %xmm1, %xmm1
 ; CHECK-LIBCALL-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -530,24 +530,24 @@
 ; CHECK-I686-NEXT:    movzwl 6(%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
-; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; CHECK-I686-NEXT:    movzwl 4(%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
-; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; CHECK-I686-NEXT:    movzwl 2(%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
-; CHECK-I686-NEXT:    fstpt {{[0-9]+}}(%esp) # 10-byte Folded Spill
+; CHECK-I686-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; CHECK-I686-NEXT:    movzwl (%esi), %eax
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    fldt {{[0-9]+}}(%esp) # 10-byte Folded Reload
+; CHECK-I686-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
 ; CHECK-I686-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-I686-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -654,24 +654,24 @@
 ; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
 ; CHECK-I686-NEXT:    subl $44, %esp
-; CHECK-I686-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-I686-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-I686-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
 ; CHECK-I686-NEXT:    movss %xmm1, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, %si
-; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, %di
-; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, %bx
-; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_f2h_ieee
 ; CHECK-I686-NEXT:    movw %ax, (%ebp)
@@ -699,7 +699,7 @@
 ; BWON-NOF16C-NEXT:    subq $40, %rsp
 ; BWON-NOF16C-NEXT:    movq %rdi, %rbx
 ; BWON-NOF16C-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
-; BWON-NOF16C-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWON-NOF16C-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; BWON-NOF16C-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; BWON-NOF16C-NEXT:    callq __truncdfhf2
 ; BWON-NOF16C-NEXT:    movl %eax, %r14d
@@ -707,7 +707,7 @@
 ; BWON-NOF16C-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; BWON-NOF16C-NEXT:    callq __truncdfhf2
 ; BWON-NOF16C-NEXT:    movl %eax, %r15d
-; BWON-NOF16C-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWON-NOF16C-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; BWON-NOF16C-NEXT:    callq __truncdfhf2
 ; BWON-NOF16C-NEXT:    movl %eax, %ebp
 ; BWON-NOF16C-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
@@ -732,7 +732,7 @@
 ; BWOFF-NEXT:    subq $40, %rsp
 ; BWOFF-NEXT:    movq %rdi, %rbx
 ; BWOFF-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
-; BWOFF-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWOFF-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; BWOFF-NEXT:    callq __truncdfhf2
 ; BWOFF-NEXT:    movw %ax, %r14w
@@ -740,7 +740,7 @@
 ; BWOFF-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; BWOFF-NEXT:    callq __truncdfhf2
 ; BWOFF-NEXT:    movw %ax, %r15w
-; BWOFF-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWOFF-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; BWOFF-NEXT:    callq __truncdfhf2
 ; BWOFF-NEXT:    movw %ax, %bp
 ; BWOFF-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
@@ -764,24 +764,24 @@
 ; BWON-F16C-NEXT:    pushq %rbx
 ; BWON-F16C-NEXT:    subq $88, %rsp
 ; BWON-F16C-NEXT:    movq %rdi, %rbx
-; BWON-F16C-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; BWON-F16C-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; BWON-F16C-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; BWON-F16C-NEXT:    vzeroupper
 ; BWON-F16C-NEXT:    callq __truncdfhf2
 ; BWON-F16C-NEXT:    movl %eax, %r14d
-; BWON-F16C-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; BWON-F16C-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; BWON-F16C-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; BWON-F16C-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; BWON-F16C-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; BWON-F16C-NEXT:    vzeroupper
 ; BWON-F16C-NEXT:    callq __truncdfhf2
 ; BWON-F16C-NEXT:    movl %eax, %r15d
-; BWON-F16C-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; BWON-F16C-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; BWON-F16C-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; BWON-F16C-NEXT:    vzeroupper
 ; BWON-F16C-NEXT:    callq __truncdfhf2
 ; BWON-F16C-NEXT:    movl %eax, %ebp
-; BWON-F16C-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; BWON-F16C-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; BWON-F16C-NEXT:    callq __truncdfhf2
 ; BWON-F16C-NEXT:    movw %ax, 4(%rbx)
 ; BWON-F16C-NEXT:    movw %bp, (%rbx)
@@ -801,21 +801,21 @@
 ; CHECK-I686-NEXT:    pushl %edi
 ; CHECK-I686-NEXT:    pushl %esi
 ; CHECK-I686-NEXT:    subl $60, %esp
-; CHECK-I686-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
-; CHECK-I686-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
+; CHECK-I686-NEXT:    movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-I686-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; CHECK-I686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    movw %ax, %si
-; CHECK-I686-NEXT:    movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhpd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    movw %ax, %di
-; CHECK-I686-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movlps %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    movw %ax, %bx
-; CHECK-I686-NEXT:    movapd {{[0-9]+}}(%esp), %xmm0 # 16-byte Reload
+; CHECK-I686-NEXT:    movapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-I686-NEXT:    movhpd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncdfhf2
 ; CHECK-I686-NEXT:    movw %ax, 6(%ebp)
@@ -885,12 +885,12 @@
 ; CHECK-LIBCALL-NEXT:    movl %edi, %ebx
 ; CHECK-LIBCALL-NEXT:    movzwl (%rsi), %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-LIBCALL-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-LIBCALL-NEXT:    cvtsi2ssl %ebx, %xmm0
 ; CHECK-LIBCALL-NEXT:    callq __gnu_f2h_ieee
 ; CHECK-LIBCALL-NEXT:    movzwl %ax, %edi
 ; CHECK-LIBCALL-NEXT:    callq __gnu_h2f_ieee
-; CHECK-LIBCALL-NEXT:    addss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-LIBCALL-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; CHECK-LIBCALL-NEXT:    addq $16, %rsp
 ; CHECK-LIBCALL-NEXT:    popq %rbx
 ; CHECK-LIBCALL-NEXT:    retq
@@ -915,7 +915,7 @@
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
 ; CHECK-I686-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-I686-NEXT:    movss %xmm0, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-I686-NEXT:    movss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-I686-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-I686-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    movss %xmm0, (%esp)
@@ -924,7 +924,7 @@
 ; CHECK-I686-NEXT:    movl %eax, (%esp)
 ; CHECK-I686-NEXT:    calll __gnu_h2f_ieee
 ; CHECK-I686-NEXT:    fstps {{[0-9]+}}(%esp)
-; CHECK-I686-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # 4-byte Reload
+; CHECK-I686-NEXT:    movss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-I686-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-I686-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; CHECK-I686-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 37841b8..c286c46 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -288,9 +288,9 @@
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X32-SSE1-NEXT:    retl
 ;
 ; X32-SSE41-LABEL: merge_4f32_f32_012u:
@@ -337,9 +337,9 @@
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X32-SSE1-NEXT:    retl
 ;
 ; X32-SSE41-LABEL: merge_4f32_f32_019u:
@@ -1198,11 +1198,11 @@
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; X32-SSE1-NEXT:    retl
 ;
 ; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll
index dace3cd..f94dd05 100644
--- a/llvm/test/CodeGen/X86/mmx-build-vector.ll
+++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll
@@ -651,8 +651,8 @@
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movdq2q %xmm0, %mm0
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movdq2q %xmm1, %mm0
 ; X86-SSE-NEXT:    movdq2q %xmm0, %mm1
 ; X86-SSE-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
 ; X86-SSE-NEXT:    paddd %mm1, %mm1
diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll
index 4a75d3e..94deca3 100644
--- a/llvm/test/CodeGen/X86/pr30430.ll
+++ b/llvm/test/CodeGen/X86/pr30430.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    andq $-64, %rsp
-; CHECK-NEXT:    subq $256, %rsp # imm = 0x100
+; CHECK-NEXT:    subq $320, %rsp # imm = 0x140
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
@@ -61,50 +61,68 @@
 ; CHECK-NEXT:    vmovss %xmm23, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm16 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm17 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm18 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm19 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm20 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm21 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm22 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm23 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovaps %zmm21, %zmm0
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %zmm20, %zmm0
+; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT:    vmovaps %zmm22, %zmm1
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps %zmm23, %zmm1
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; CHECK-NEXT:    # implicit-def: $ymm2
-; CHECK-NEXT:    vmovaps %xmm1, %xmm2
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps %zmm17, %zmm1
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %zmm16, %zmm0
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT:    vmovaps %zmm18, %zmm1
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps %zmm19, %zmm1
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; CHECK-NEXT:    # implicit-def: $ymm1
+; CHECK-NEXT:    vmovaps %xmm0, %xmm1
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[0]
+; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Reload
+; CHECK-NEXT:    # xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 4-byte Reload
+; CHECK-NEXT:    # xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm4[0,1],xmm2[0],xmm4[3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; CHECK-NEXT:    # implicit-def: $ymm3
-; CHECK-NEXT:    vmovaps %xmm1, %xmm3
+; CHECK-NEXT:    vmovaps %xmm2, %xmm3
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm3
 ; CHECK-NEXT:    # implicit-def: $zmm24
 ; CHECK-NEXT:    vmovaps %zmm3, %zmm24
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm24, %zmm24
 ; CHECK-NEXT:    vmovaps %zmm24, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
-; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm8, {{[0-9]+}}(%rsp) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm14, (%rsp) # 4-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
diff --git a/llvm/test/CodeGen/X86/sse-fcopysign.ll b/llvm/test/CodeGen/X86/sse-fcopysign.ll
index 883fb52..492c873 100644
--- a/llvm/test/CodeGen/X86/sse-fcopysign.ll
+++ b/llvm/test/CodeGen/X86/sse-fcopysign.ll
@@ -65,11 +65,11 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:    andps {{\.LCPI.*}}, %xmm0
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    andps {{\.LCPI.*}}, %xmm1
-; X32-NEXT:    orps %xmm0, %xmm1
-; X32-NEXT:    movss %xmm1, (%esp)
+; X32-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X32-NEXT:    orps %xmm1, %xmm0
+; X32-NEXT:    movss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
 ; X32-NEXT:    retl
@@ -91,14 +91,14 @@
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-8, %esp
 ; X32-NEXT:    subl $8, %esp
-; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:    addss 20(%ebp), %xmm0
-; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    andps {{\.LCPI.*}}, %xmm1
-; X32-NEXT:    cvtss2sd %xmm0, %xmm0
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    addss 20(%ebp), %xmm1
 ; X32-NEXT:    andps {{\.LCPI.*}}, %xmm0
-; X32-NEXT:    orps %xmm1, %xmm0
-; X32-NEXT:    movlps %xmm0, (%esp)
+; X32-NEXT:    cvtss2sd %xmm1, %xmm1
+; X32-NEXT:    andps {{\.LCPI.*}}, %xmm1
+; X32-NEXT:    orps %xmm0, %xmm1
+; X32-NEXT:    movlps %xmm1, (%esp)
 ; X32-NEXT:    fldl (%esp)
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index dc943c0..661a478 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -436,16 +436,16 @@
 ; SSE3-NEXT:    movd %xmm1, %esi
 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE3-NEXT:    andl $3, %eax
+; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE3-NEXT:    andl $3, %ecx
-; SSE3-NEXT:    andl $3, %edx
-; SSE3-NEXT:    andl $3, %esi
-; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT:    andl $3, %edx
 ; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT:    andl $3, %esi
+; SSE3-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE3-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: var_shuffle_v4f32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 18b97b1..f13c048 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -73,17 +73,17 @@
 ; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
 ; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE2-NEXT:    andl $3, %edi
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    andl $3, %edx
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    andl $3, %ecx
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    andl $3, %esi
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    andl $3, %edx
 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
@@ -93,17 +93,17 @@
 ; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
 ; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSSE3-NEXT:    andl $3, %edi
-; SSSE3-NEXT:    andl $3, %esi
-; SSSE3-NEXT:    andl $3, %edx
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    andl $3, %ecx
 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    andl $3, %esi
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    andl $3, %edx
 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    andl $3, %ecx
+; SSSE3-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
@@ -113,11 +113,11 @@
 ; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
 ; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
 ; SSE41-NEXT:    andl $3, %edi
+; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    andl $3, %esi
 ; SSE41-NEXT:    andl $3, %edx
-; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT:    andl $3, %ecx
-; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
@@ -130,11 +130,11 @@
 ; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
 ; AVX-NEXT:    andl $3, %edi
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    andl $3, %esi
 ; AVX-NEXT:    andl $3, %edx
-; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX-NEXT:    andl $3, %ecx
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
@@ -1222,10 +1222,10 @@
 ; SSE-NEXT:    andl $3, %edi
 ; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    andl $3, %edx
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    andl $3, %ecx
 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1239,10 +1239,10 @@
 ; AVX-NEXT:    andl $3, %edi
 ; AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
 ; AVX-NEXT:    andl $3, %edx
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX-NEXT:    andl $3, %ecx
 ; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
index 6a159ab..2c1b146 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -14,15 +14,15 @@
 ; ALL-NEXT:    andq $-32, %rsp
 ; ALL-NEXT:    subq $64, %rsp
 ; ALL-NEXT:    andl $3, %esi
-; ALL-NEXT:    andl $3, %edi
 ; ALL-NEXT:    andl $3, %ecx
 ; ALL-NEXT:    andl $3, %edx
+; ALL-NEXT:    andl $3, %edi
 ; ALL-NEXT:    vmovaps %ymm0, (%rsp)
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT:    movq %rbp, %rsp
 ; ALL-NEXT:    popq %rbp
 ; ALL-NEXT:    retq
@@ -68,15 +68,15 @@
 ; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    andl $1, %esi
-; ALL-NEXT:    andl $1, %edi
 ; ALL-NEXT:    andl $1, %ecx
 ; ALL-NEXT:    andl $1, %edx
+; ALL-NEXT:    andl $1, %edi
 ; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %x0 = extractelement <2 x double> %x, i64 %i0
   %x1 = extractelement <2 x double> %x, i64 %i1
@@ -194,21 +194,21 @@
 ; ALL-NEXT:    movl 16(%rbp), %eax
 ; ALL-NEXT:    andl $7, %eax
 ; ALL-NEXT:    andl $7, %edi
+; ALL-NEXT:    vmovaps %ymm0, (%rsp)
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; ALL-NEXT:    andl $7, %esi
 ; ALL-NEXT:    andl $7, %edx
 ; ALL-NEXT:    andl $7, %ecx
 ; ALL-NEXT:    andl $7, %r8d
-; ALL-NEXT:    vmovaps %ymm0, (%rsp)
-; ALL-NEXT:    andl $7, %r9d
-; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    andl $7, %r9d
 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT:    movq %rbp, %rsp
 ; ALL-NEXT:    popq %rbp
 ; ALL-NEXT:    retq
@@ -245,21 +245,21 @@
 ; ALL-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; ALL-NEXT:    andl $3, %eax
 ; ALL-NEXT:    andl $3, %edi
+; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; ALL-NEXT:    andl $3, %esi
 ; ALL-NEXT:    andl $3, %edx
 ; ALL-NEXT:    andl $3, %ecx
 ; ALL-NEXT:    andl $3, %r8d
-; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    andl $3, %r9d
-; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    andl $3, %r9d
 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %x0 = extractelement <4 x float> %x, i32 %i0
   %x1 = extractelement <4 x float> %x, i32 %i1