[X86] Handle COPYs of physregs better (regalloc hints)

Enable enableMultipleCopyHints() on X86.

Original Patch by @jonpa:

While enabling the mischeduler for SystemZ, it was discovered that for some reason a test needed one extra, seemingly needless COPY (test/CodeGen/SystemZ/call-03.ll). The handling of that case resulted in this patch, which improves register coalescing by providing not just one copy hint, but a sorted list of copy hints. On SystemZ, this gives ~12500 fewer register moves on SPEC, as well as marginally less spilling.

Instead of improving just the SystemZ backend, the improvement has been implemented in common code (calculateSpillWeightAndHint()). This causes a lot of test failures, but since this should be a general improvement I hope that the involved targets will help and review the test updates.

Differential Revision: https://reviews.llvm.org/D38128

llvm-svn: 342578
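
For context, enabling the hook on a target is a one-line override of the TargetRegisterInfo hook; below is a minimal sketch of what the X86 side looks like (the exact file, assumed here to be X86RegisterInfo.h, is not part of the test-only diff that follows):

    // Sketch (assumed location: llvm/lib/Target/X86/X86RegisterInfo.h).
    // Opt X86 in to the multiple-copy-hint support in the common
    // calculateSpillWeightAndHint() code.
    bool enableMultipleCopyHints() const override { return true; }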
diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
index e8098dd..811243b 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -54,14 +54,15 @@
   ret i16 %ret
 }
 
-define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
-; X64-LABEL: test_add_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    addb %dil, %sil
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_add_i8:
+define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
+; X64-LABEL: test_add_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    addb %dil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_add_i8:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X32-NEXT:    addb {{[0-9]+}}(%esp), %al
diff --git a/llvm/test/CodeGen/X86/GlobalISel/and-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/and-scalar.ll
index b237013..a5bebff 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/and-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/and-scalar.ll
@@ -16,43 +16,45 @@
   ret i32 %ret
 }
 
-define i8 @test_and_i8(i8 %arg1, i8 %arg2) {
-; ALL-LABEL: test_and_i8:
-; ALL:       # %bb.0:
-; ALL-NEXT:    andb %dil, %sil
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = and i8 %arg1, %arg2
-  ret i8 %ret
-}
-
-define i16 @test_and_i16(i16 %arg1, i16 %arg2) {
-; ALL-LABEL: test_and_i16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    andw %di, %si
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = and i16 %arg1, %arg2
-  ret i16 %ret
-}
-
-define i32 @test_and_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_and_i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    andl %edi, %esi
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = and i32 %arg1, %arg2
-  ret i32 %ret
-}
-
-define i64 @test_and_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_and_i64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    andq %rdi, %rsi
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-  %ret = and i64 %arg1, %arg2
-  ret i64 %ret
+define i8 @test_and_i8(i8 %arg1, i8 %arg2) {
+; ALL-LABEL: test_and_i8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    andb %dil, %al
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
+; ALL-NEXT:    retq
+  %ret = and i8 %arg1, %arg2
+  ret i8 %ret
+}
+
+define i16 @test_and_i16(i16 %arg1, i16 %arg2) {
+; ALL-LABEL: test_and_i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    andw %di, %ax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+  %ret = and i16 %arg1, %arg2
+  ret i16 %ret
+}
+
+define i32 @test_and_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_and_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    andl %edi, %eax
+; ALL-NEXT:    retq
+  %ret = and i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define i64 @test_and_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_and_i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    andq %rdi, %rax
+; ALL-NEXT:    retq
+  %ret = and i64 %arg1, %arg2
+  ret i64 %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll
index 29848ad..0959553 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll
@@ -1,180 +1,191 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
 
-define i64 @test_ashr_i64(i64 %arg1, i64 %arg2) {
-; X64-LABEL: test_ashr_i64:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    sarq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = ashr i64 %arg1, %arg2
-  ret i64 %res
-}
-
-define i64 @test_ashr_i64_imm(i64 %arg1) {
-; X64-LABEL: test_ashr_i64_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movq $5, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    sarq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = ashr i64 %arg1, 5
-  ret i64 %res
-}
-
-define i64 @test_ashr_i64_imm1(i64 %arg1) {
-; X64-LABEL: test_ashr_i64_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movq $1, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    sarq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = ashr i64 %arg1, 1
-  ret i64 %res
-}
-
-define i32 @test_ashr_i32(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_ashr_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    sarl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = ashr i32 %arg1, %arg2
-  ret i32 %res
-}
-
-define i32 @test_ashr_i32_imm(i32 %arg1) {
-; X64-LABEL: test_ashr_i32_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $5, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    sarl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = ashr i32 %arg1, 5
-  ret i32 %res
-}
-
-define i32 @test_ashr_i32_imm1(i32 %arg1) {
-; X64-LABEL: test_ashr_i32_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    sarl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = ashr i32 %arg1, 1
-  ret i32 %res
-}
-
-define i16 @test_ashr_i16(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_ashr_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    sarw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %a2 = trunc i32 %arg2 to i16
+define i64 @test_ashr_i64(i64 %arg1, i64 %arg2) {
+; X64-LABEL: test_ashr_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    sarq %cl, %rax
+; X64-NEXT:    retq
+  %res = ashr i64 %arg1, %arg2
+  ret i64 %res
+}
+
+define i64 @test_ashr_i64_imm(i64 %arg1) {
+; X64-LABEL: test_ashr_i64_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq $5, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    sarq %cl, %rax
+; X64-NEXT:    retq
+  %res = ashr i64 %arg1, 5
+  ret i64 %res
+}
+
+define i64 @test_ashr_i64_imm1(i64 %arg1) {
+; X64-LABEL: test_ashr_i64_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq $1, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    sarq %cl, %rax
+; X64-NEXT:    retq
+  %res = ashr i64 %arg1, 1
+  ret i64 %res
+}
+
+define i32 @test_ashr_i32(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_ashr_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    sarl %cl, %eax
+; X64-NEXT:    retq
+  %res = ashr i32 %arg1, %arg2
+  ret i32 %res
+}
+
+define i32 @test_ashr_i32_imm(i32 %arg1) {
+; X64-LABEL: test_ashr_i32_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $5, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    sarl %cl, %eax
+; X64-NEXT:    retq
+  %res = ashr i32 %arg1, 5
+  ret i32 %res
+}
+
+define i32 @test_ashr_i32_imm1(i32 %arg1) {
+; X64-LABEL: test_ashr_i32_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    sarl %cl, %eax
+; X64-NEXT:    retq
+  %res = ashr i32 %arg1, 1
+  ret i32 %res
+}
+
+define i16 @test_ashr_i16(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_ashr_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cx killed $cx killed $ecx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    sarw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %a2 = trunc i32 %arg2 to i16
   %res = ashr i16 %a, %a2
   ret i16 %res
 }
 
-define i16 @test_ashr_i16_imm(i32 %arg1) {
-; X64-LABEL: test_ashr_i16_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movw $5, %cx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    sarw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %res = ashr i16 %a, 5
+define i16 @test_ashr_i16_imm(i32 %arg1) {
+; X64-LABEL: test_ashr_i16_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movw $5, %cx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    sarw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %res = ashr i16 %a, 5
   ret i16 %res
 }
 
-define i16 @test_ashr_i16_imm1(i32 %arg1) {
-; X64-LABEL: test_ashr_i16_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movw $1, %cx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    sarw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %res = ashr i16 %a, 1
+define i16 @test_ashr_i16_imm1(i32 %arg1) {
+; X64-LABEL: test_ashr_i16_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movw $1, %cx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    sarw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %res = ashr i16 %a, 1
   ret i16 %res
 }
 
-define i8 @test_ashr_i8(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_ashr_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    sarb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %a2 = trunc i32 %arg2 to i8
+define i8 @test_ashr_i8(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_ashr_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    sarb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %a2 = trunc i32 %arg2 to i8
   %res = ashr i8 %a, %a2
   ret i8 %res
 }
 
-define i8 @test_ashr_i8_imm(i32 %arg1) {
-; X64-LABEL: test_ashr_i8_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    sarb $5, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %res = ashr i8 %a, 5
+define i8 @test_ashr_i8_imm(i32 %arg1) {
+; X64-LABEL: test_ashr_i8_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    sarb $5, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %res = ashr i8 %a, 5
   ret i8 %res
 }
 
-define i8 @test_ashr_i8_imm1(i32 %arg1) {
-; X64-LABEL: test_ashr_i8_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    sarb %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %res = ashr i8 %a, 1
+define i8 @test_ashr_i8_imm1(i32 %arg1) {
+; X64-LABEL: test_ashr_i8_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    sarb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %res = ashr i8 %a, 1
   ret i8 %res
 }
 
-define i1 @test_ashr_i1(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_ashr_i1:
-; X64:       # %bb.0:
-; X64-NEXT:    shlb $7, %dil
-; X64-NEXT:    sarb $7, %dil
-; X64-NEXT:    andb $1, %sil
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    sarb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i1
-  %a2 = trunc i32 %arg2 to i1
+define i1 @test_ashr_i1(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_ashr_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    shlb $7, %al
+; X64-NEXT:    sarb $7, %al
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    sarb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i1
+  %a2 = trunc i32 %arg2 to i1
   %res = ashr i1 %a, %a2
   ret i1 %res
 }
 
-define i1 @test_ashr_i1_imm1(i32 %arg1) {
-; X64-LABEL: test_ashr_i1_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $-1, %cl
-; X64-NEXT:    shlb $7, %dil
-; X64-NEXT:    sarb $7, %dil
-; X64-NEXT:    andb $1, %cl
-; X64-NEXT:    sarb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i1
-  %res = ashr i1 %a, 1
+define i1 @test_ashr_i1_imm1(i32 %arg1) {
+; X64-LABEL: test_ashr_i1_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movb $-1, %cl
+; X64-NEXT:    shlb $7, %al
+; X64-NEXT:    sarb $7, %al
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    sarb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i1
+  %res = ashr i1 %a, 1
   ret i1 %res
 }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/binop.ll b/llvm/test/CodeGen/X86/GlobalISel/binop.ll
index a0efcff..dbb150c 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/binop.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/binop.ll
@@ -4,24 +4,24 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f                  -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F
 ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL
 
-define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_sub_i64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    subq %rsi, %rdi
-; ALL-NEXT:    movq %rdi, %rax
-; ALL-NEXT:    retq
-  %ret = sub i64 %arg1, %arg2
-  ret i64 %ret
-}
-
-define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_sub_i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    subl %esi, %edi
-; ALL-NEXT:    movl %edi, %eax
-; ALL-NEXT:    retq
-  %ret = sub i32 %arg1, %arg2
-  ret i32 %ret
+define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_sub_i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rdi, %rax
+; ALL-NEXT:    subq %rsi, %rax
+; ALL-NEXT:    retq
+  %ret = sub i64 %arg1, %arg2
+  ret i64 %ret
+}
+
+define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_sub_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    subl %esi, %eax
+; ALL-NEXT:    retq
+  %ret = sub i32 %arg1, %arg2
+  ret i32 %ret
 }
 
 define float @test_add_float(float %arg1, float %arg2) {
diff --git a/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll b/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
index ede0df3..e3f9a4e 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -35,12 +35,13 @@
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: test_arg_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  ret i8 %a
-}
+; X64-LABEL: test_arg_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  ret i8 %a
+}
 
 define i16 @test_arg_i16(i16 %a) {
 ; X32-LABEL: test_arg_i16:
@@ -48,12 +49,13 @@
 ; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: test_arg_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  ret i16 %a
-}
+; X64-LABEL: test_arg_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  ret i16 %a
+}
 
 define i32 @test_arg_i32(i32 %a) {
 ; X32-LABEL: test_arg_i32:
@@ -111,14 +113,14 @@
 
 define <8 x i32> @test_v8i32_args(<8 x i32> %arg1, <8 x i32> %arg2) {
 ; X32-LABEL: test_v8i32_args:
-; X32:       # %bb.0:
-; X32-NEXT:    subl $12, %esp
-; X32-NEXT:    .cfi_def_cfa_offset 16
-; X32-NEXT:    movups {{[0-9]+}}(%esp), %xmm1
-; X32-NEXT:    movaps %xmm2, %xmm0
-; X32-NEXT:    addl $12, %esp
-; X32-NEXT:    .cfi_def_cfa_offset 4
-; X32-NEXT:    retl
+; X32:       # %bb.0:
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    movups {{[0-9]+}}(%esp), %xmm1
+; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_v8i32_args:
 ; X64:       # %bb.0:
@@ -254,14 +256,14 @@
 ; X32-LABEL: test_split_return_callee:
 ; X32:       # %bb.0:
 ; X32-NEXT:    subl $44, %esp
-; X32-NEXT:    .cfi_def_cfa_offset 48
-; X32-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
-; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
-; X32-NEXT:    movdqu {{[0-9]+}}(%esp), %xmm1
-; X32-NEXT:    movdqa %xmm2, %xmm0
-; X32-NEXT:    calll split_return_callee
-; X32-NEXT:    paddd (%esp), %xmm0 # 16-byte Folded Reload
-; X32-NEXT:    paddd {{[0-9]+}}(%esp), %xmm1 # 16-byte Folded Reload
+; X32-NEXT:    .cfi_def_cfa_offset 48
+; X32-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
+; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT:    movdqa %xmm2, %xmm0
+; X32-NEXT:    movdqu {{[0-9]+}}(%esp), %xmm1
+; X32-NEXT:    calll split_return_callee
+; X32-NEXT:    paddd (%esp), %xmm0 # 16-byte Folded Reload
+; X32-NEXT:    paddd {{[0-9]+}}(%esp), %xmm1 # 16-byte Folded Reload
 ; X32-NEXT:    addl $44, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/llvm/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
index 6bbe12f..c340955 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
@@ -3,48 +3,45 @@
 
 ; TODO merge with ext.ll after i64 sext supported on 32bit platform
 
-define i64 @test_zext_i1(i8 %a) {
-; X64-LABEL: test_zext_i1:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andq $1, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %val = trunc i8 %a to i1
-  %r = zext i1 %val to i64
+define i64 @test_zext_i1(i8 %a) {
+; X64-LABEL: test_zext_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andq $1, %rax
+; X64-NEXT:    retq
+  %val = trunc i8 %a to i1
+  %r = zext i1 %val to i64
   ret i64 %r
 }
 
-define i64 @test_sext_i8(i8 %val) {
-; X64-LABEL: test_sext_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movq $56, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdi
-; X64-NEXT:    movq $56, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    sarq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %r = sext i8 %val to i64
-  ret i64 %r
-}
-
-define i64 @test_sext_i16(i16 %val) {
-; X64-LABEL: test_sext_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movq $48, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdi
-; X64-NEXT:    movq $48, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    sarq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %r = sext i16 %val to i64
-  ret i64 %r
+define i64 @test_sext_i8(i8 %val) {
+; X64-LABEL: test_sext_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movq $56, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq $56, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    sarq %cl, %rax
+; X64-NEXT:    retq
+  %r = sext i8 %val to i64
+  ret i64 %r
+}
+
+define i64 @test_sext_i16(i16 %val) {
+; X64-LABEL: test_sext_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movq $48, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq $48, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    sarq %cl, %rax
+; X64-NEXT:    retq
+  %r = sext i16 %val to i64
+  ret i64 %r
 }
 
 ; TODO enable after selection supported
diff --git a/llvm/test/CodeGen/X86/GlobalISel/ext.ll b/llvm/test/CodeGen/X86/GlobalISel/ext.ll
index 3310b0b..5ece4e2 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/ext.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/ext.ll
@@ -2,14 +2,15 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu    -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
 ; RUN: llc -mtriple=i386-linux-gnu      -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X32
 
-define i8 @test_zext_i1toi8(i32 %a) {
-; X64-LABEL: test_zext_i1toi8:
-; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_zext_i1toi8:
+define i8 @test_zext_i1toi8(i32 %a) {
+; X64-LABEL: test_zext_i1toi8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_zext_i1toi8:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    andb $1, %al
@@ -20,14 +21,15 @@
   ret i8 %r
 }
 
-define i16 @test_zext_i1toi16(i32 %a) {
-; X64-LABEL: test_zext_i1toi16:
-; X64:       # %bb.0:
-; X64-NEXT:    andw $1, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_zext_i1toi16:
+define i16 @test_zext_i1toi16(i32 %a) {
+; X64-LABEL: test_zext_i1toi16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andw $1, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_zext_i1toi16:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    andw $1, %ax
@@ -38,14 +40,14 @@
   ret i16 %r
 }
 
-define i32 @test_zext_i1(i32 %a) {
-; X64-LABEL: test_zext_i1:
-; X64:       # %bb.0:
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_zext_i1:
+define i32 @test_zext_i1(i32 %a) {
+; X64-LABEL: test_zext_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_zext_i1:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    andl $1, %eax
@@ -83,19 +85,19 @@
   ret i32 %r
 }
 
-define i32 @test_sext_i8(i8 %val) {
-; X64-LABEL: test_sext_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    movl $24, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    sarl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_sext_i8:
+define i32 @test_sext_i8(i8 %val) {
+; X64-LABEL: test_sext_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $24, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    movl $24, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    sarl %cl, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_sext_i8:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
@@ -103,19 +105,19 @@
   ret i32 %r
 }
 
-define i32 @test_sext_i16(i16 %val) {
-; X64-LABEL: test_sext_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $16, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    movl $16, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    sarl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_sext_i16:
+define i32 @test_sext_i16(i16 %val) {
+; X64-LABEL: test_sext_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    movl $16, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    sarl %cl, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_sext_i16:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movswl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll
index 0fc40ee..11633fb 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll
@@ -1,178 +1,189 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
 
-define i64 @test_lshr_i64(i64 %arg1, i64 %arg2) {
-; X64-LABEL: test_lshr_i64:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shrq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = lshr i64 %arg1, %arg2
-  ret i64 %res
-}
-
-define i64 @test_lshr_i64_imm(i64 %arg1) {
-; X64-LABEL: test_lshr_i64_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movq $5, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shrq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = lshr i64 %arg1, 5
-  ret i64 %res
-}
-
-define i64 @test_lshr_i64_imm1(i64 %arg1) {
-; X64-LABEL: test_lshr_i64_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movq $1, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shrq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = lshr i64 %arg1, 1
-  ret i64 %res
-}
-
-define i32 @test_lshr_i32(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_lshr_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = lshr i32 %arg1, %arg2
-  ret i32 %res
-}
-
-define i32 @test_lshr_i32_imm(i32 %arg1) {
-; X64-LABEL: test_lshr_i32_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $5, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = lshr i32 %arg1, 5
-  ret i32 %res
-}
-
-define i32 @test_lshr_i32_imm1(i32 %arg1) {
-; X64-LABEL: test_lshr_i32_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shrl %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = lshr i32 %arg1, 1
-  ret i32 %res
-}
-
-define i16 @test_lshr_i16(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_lshr_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    shrw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %a2 = trunc i32 %arg2 to i16
+define i64 @test_lshr_i64(i64 %arg1, i64 %arg2) {
+; X64-LABEL: test_lshr_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shrq %cl, %rax
+; X64-NEXT:    retq
+  %res = lshr i64 %arg1, %arg2
+  ret i64 %res
+}
+
+define i64 @test_lshr_i64_imm(i64 %arg1) {
+; X64-LABEL: test_lshr_i64_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq $5, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shrq %cl, %rax
+; X64-NEXT:    retq
+  %res = lshr i64 %arg1, 5
+  ret i64 %res
+}
+
+define i64 @test_lshr_i64_imm1(i64 %arg1) {
+; X64-LABEL: test_lshr_i64_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq $1, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shrq %cl, %rax
+; X64-NEXT:    retq
+  %res = lshr i64 %arg1, 1
+  ret i64 %res
+}
+
+define i32 @test_lshr_i32(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_lshr_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shrl %cl, %eax
+; X64-NEXT:    retq
+  %res = lshr i32 %arg1, %arg2
+  ret i32 %res
+}
+
+define i32 @test_lshr_i32_imm(i32 %arg1) {
+; X64-LABEL: test_lshr_i32_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $5, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shrl %cl, %eax
+; X64-NEXT:    retq
+  %res = lshr i32 %arg1, 5
+  ret i32 %res
+}
+
+define i32 @test_lshr_i32_imm1(i32 %arg1) {
+; X64-LABEL: test_lshr_i32_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shrl %cl, %eax
+; X64-NEXT:    retq
+  %res = lshr i32 %arg1, 1
+  ret i32 %res
+}
+
+define i16 @test_lshr_i16(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_lshr_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cx killed $cx killed $ecx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    shrw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %a2 = trunc i32 %arg2 to i16
   %res = lshr i16 %a, %a2
   ret i16 %res
 }
 
-define i16 @test_lshr_i16_imm(i32 %arg1) {
-; X64-LABEL: test_lshr_i16_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movw $5, %cx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    shrw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %res = lshr i16 %a, 5
+define i16 @test_lshr_i16_imm(i32 %arg1) {
+; X64-LABEL: test_lshr_i16_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movw $5, %cx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    shrw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %res = lshr i16 %a, 5
   ret i16 %res
 }
 
-define i16 @test_lshr_i16_imm1(i32 %arg1) {
-; X64-LABEL: test_lshr_i16_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movw $1, %cx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    shrw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %res = lshr i16 %a, 1
+define i16 @test_lshr_i16_imm1(i32 %arg1) {
+; X64-LABEL: test_lshr_i16_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movw $1, %cx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    shrw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %res = lshr i16 %a, 1
   ret i16 %res
 }
 
-define i8 @test_lshr_i8(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_lshr_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %a2 = trunc i32 %arg2 to i8
+define i8 @test_lshr_i8(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_lshr_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %a2 = trunc i32 %arg2 to i8
   %res = lshr i8 %a, %a2
   ret i8 %res
 }
 
-define i8 @test_lshr_i8_imm(i32 %arg1) {
-; X64-LABEL: test_lshr_i8_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    shrb $5, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %res = lshr i8 %a, 5
+define i8 @test_lshr_i8_imm(i32 %arg1) {
+; X64-LABEL: test_lshr_i8_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb $5, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %res = lshr i8 %a, 5
   ret i8 %res
 }
 
-define i8 @test_lshr_i8_imm1(i32 %arg1) {
-; X64-LABEL: test_lshr_i8_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %res = lshr i8 %a, 1
+define i8 @test_lshr_i8_imm1(i32 %arg1) {
+; X64-LABEL: test_lshr_i8_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %res = lshr i8 %a, 1
   ret i8 %res
 }
 
-define i1 @test_lshr_i1(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_lshr_i1:
-; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    andb $1, %sil
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i1
-  %a2 = trunc i32 %arg2 to i1
+define i1 @test_lshr_i1(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_lshr_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i1
+  %a2 = trunc i32 %arg2 to i1
   %res = lshr i1 %a, %a2
   ret i1 %res
 }
 
-define i1 @test_lshr_i1_imm1(i32 %arg1) {
-; X64-LABEL: test_lshr_i1_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $-1, %cl
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    andb $1, %cl
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i1
-  %res = lshr i1 %a, 1
+define i1 @test_lshr_i1_imm1(i32 %arg1) {
+; X64-LABEL: test_lshr_i1_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movb $-1, %cl
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i1
+  %res = lshr i1 %a, 1
   ret i1 %res
 }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/memop-scalar.ll
index 0355c39..6b94feb 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/memop-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/memop-scalar.ll
@@ -79,71 +79,71 @@
   ret double %r
 }
 
-define i1 * @test_store_i1(i1 %val, i1 * %p1) {
-; ALL-LABEL: test_store_i1:
-; ALL:       # %bb.0:
-; ALL-NEXT:    andb $1, %dil
-; ALL-NEXT:    movb %dil, (%rsi)
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-  store i1 %val, i1* %p1
-  ret i1 * %p1;
-}
-
-define i32 * @test_store_i32(i32 %val, i32 * %p1) {
-; ALL-LABEL: test_store_i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movl %edi, (%rsi)
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-  store i32 %val, i32* %p1
-  ret i32 * %p1;
-}
-
-define i64 * @test_store_i64(i64 %val, i64 * %p1) {
-; ALL-LABEL: test_store_i64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    movq %rdi, (%rsi)
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-  store i64 %val, i64* %p1
-  ret i64 * %p1;
+define i1 * @test_store_i1(i1 %val, i1 * %p1) {
+; ALL-LABEL: test_store_i1:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    andb $1, %dil
+; ALL-NEXT:    movb %dil, (%rsi)
+; ALL-NEXT:    retq
+  store i1 %val, i1* %p1
+  ret i1 * %p1;
+}
+
+define i32 * @test_store_i32(i32 %val, i32 * %p1) {
+; ALL-LABEL: test_store_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    movl %edi, (%rsi)
+; ALL-NEXT:    retq
+  store i32 %val, i32* %p1
+  ret i32 * %p1;
+}
+
+define i64 * @test_store_i64(i64 %val, i64 * %p1) {
+; ALL-LABEL: test_store_i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    movq %rdi, (%rsi)
+; ALL-NEXT:    retq
+  store i64 %val, i64* %p1
+  ret i64 * %p1;
 }
 
 define float * @test_store_float(float %val, float * %p1) {
-;
-; SSE_FAST-LABEL: test_store_float:
-; SSE_FAST:       # %bb.0:
-; SSE_FAST-NEXT:    movd %xmm0, %eax
-; SSE_FAST-NEXT:    movl %eax, (%rdi)
-; SSE_FAST-NEXT:    movq %rdi, %rax
-; SSE_FAST-NEXT:    retq
-;
-; SSE_GREEDY-LABEL: test_store_float:
-; SSE_GREEDY:       # %bb.0:
-; SSE_GREEDY-NEXT:    movss %xmm0, (%rdi)
-; SSE_GREEDY-NEXT:    movq %rdi, %rax
-; SSE_GREEDY-NEXT:    retq
-  store float %val, float* %p1
-  ret float * %p1;
+;
+; SSE_FAST-LABEL: test_store_float:
+; SSE_FAST:       # %bb.0:
+; SSE_FAST-NEXT:    movq %rdi, %rax
+; SSE_FAST-NEXT:    movd %xmm0, %ecx
+; SSE_FAST-NEXT:    movl %ecx, (%rdi)
+; SSE_FAST-NEXT:    retq
+;
+; SSE_GREEDY-LABEL: test_store_float:
+; SSE_GREEDY:       # %bb.0:
+; SSE_GREEDY-NEXT:    movq %rdi, %rax
+; SSE_GREEDY-NEXT:    movss %xmm0, (%rdi)
+; SSE_GREEDY-NEXT:    retq
+  store float %val, float* %p1
+  ret float * %p1;
 }
 
 define double * @test_store_double(double %val, double * %p1) {
-;
-; SSE_FAST-LABEL: test_store_double:
-; SSE_FAST:       # %bb.0:
-; SSE_FAST-NEXT:    movq %xmm0, %rax
-; SSE_FAST-NEXT:    movq %rax, (%rdi)
-; SSE_FAST-NEXT:    movq %rdi, %rax
-; SSE_FAST-NEXT:    retq
-;
-; SSE_GREEDY-LABEL: test_store_double:
-; SSE_GREEDY:       # %bb.0:
-; SSE_GREEDY-NEXT:    movsd %xmm0, (%rdi)
-; SSE_GREEDY-NEXT:    movq %rdi, %rax
-; SSE_GREEDY-NEXT:    retq
-  store double %val, double* %p1
-  ret double * %p1;
+;
+; SSE_FAST-LABEL: test_store_double:
+; SSE_FAST:       # %bb.0:
+; SSE_FAST-NEXT:    movq %rdi, %rax
+; SSE_FAST-NEXT:    movq %xmm0, %rcx
+; SSE_FAST-NEXT:    movq %rcx, (%rdi)
+; SSE_FAST-NEXT:    retq
+;
+; SSE_GREEDY-LABEL: test_store_double:
+; SSE_GREEDY:       # %bb.0:
+; SSE_GREEDY-NEXT:    movq %rdi, %rax
+; SSE_GREEDY-NEXT:    movsd %xmm0, (%rdi)
+; SSE_GREEDY-NEXT:    retq
+  store double %val, double* %p1
+  ret double * %p1;
 }
 
 define i32* @test_load_ptr(i32** %ptr1) {
diff --git a/llvm/test/CodeGen/X86/GlobalISel/mul-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/mul-scalar.ll
index 5fd64c4..897a9ec 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/mul-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/mul-scalar.ll
@@ -5,35 +5,36 @@
 ;define i8 @test_mul_i8(i8 %arg1, i8 %arg2) {
 ;  %ret = mul i8 %arg1, %arg2
 ;  ret i8 %ret
-;}
-
-define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
-; X64-LABEL: test_mul_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    imulw %di, %si
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    retq
-  %ret = mul i16 %arg1, %arg2
-  ret i16 %ret
-}
-
-define i32 @test_mul_i32(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_mul_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    imull %edi, %esi
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    retq
-  %ret = mul i32 %arg1, %arg2
-  ret i32 %ret
-}
-
-define i64 @test_mul_i64(i64 %arg1, i64 %arg2) {
-; X64-LABEL: test_mul_i64:
-; X64:       # %bb.0:
-; X64-NEXT:    imulq %rdi, %rsi
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    retq
-  %ret = mul i64 %arg1, %arg2
-  ret i64 %ret
-}
+;}
+
+define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
+; ALL-LABEL: test_mul_i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    imulw %di, %ax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+  %ret = mul i16 %arg1, %arg2
+  ret i16 %ret
+}
+
+define i32 @test_mul_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_mul_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    imull %edi, %eax
+; ALL-NEXT:    retq
+  %ret = mul i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define i64 @test_mul_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_mul_i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    imulq %rdi, %rax
+; ALL-NEXT:    retq
+  %ret = mul i64 %arg1, %arg2
+  ret i64 %ret
+}
 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/or-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/or-scalar.ll
index 397deaa..47634c8 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/or-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/or-scalar.ll
@@ -16,43 +16,45 @@
   ret i32 %ret
 }
 
-define i8 @test_or_i8(i8 %arg1, i8 %arg2) {
-; ALL-LABEL: test_or_i8:
-; ALL:       # %bb.0:
-; ALL-NEXT:    orb %dil, %sil
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = or i8 %arg1, %arg2
-  ret i8 %ret
-}
-
-define i16 @test_or_i16(i16 %arg1, i16 %arg2) {
-; ALL-LABEL: test_or_i16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    orw %di, %si
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = or i16 %arg1, %arg2
-  ret i16 %ret
-}
-
-define i32 @test_or_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_or_i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    orl %edi, %esi
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = or i32 %arg1, %arg2
-  ret i32 %ret
-}
-
-define i64 @test_or_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_or_i64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    orq %rdi, %rsi
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-  %ret = or i64 %arg1, %arg2
-  ret i64 %ret
+define i8 @test_or_i8(i8 %arg1, i8 %arg2) {
+; ALL-LABEL: test_or_i8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    orb %dil, %al
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
+; ALL-NEXT:    retq
+  %ret = or i8 %arg1, %arg2
+  ret i8 %ret
+}
+
+define i16 @test_or_i16(i16 %arg1, i16 %arg2) {
+; ALL-LABEL: test_or_i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    orw %di, %ax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+  %ret = or i16 %arg1, %arg2
+  ret i16 %ret
+}
+
+define i32 @test_or_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_or_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    orl %edi, %eax
+; ALL-NEXT:    retq
+  %ret = or i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define i64 @test_or_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_or_i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    orq %rdi, %rax
+; ALL-NEXT:    retq
+  %ret = or i64 %arg1, %arg2
+  ret i64 %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/phi.ll b/llvm/test/CodeGen/X86/GlobalISel/phi.ll
index 2157081..11c55ce 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/phi.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/phi.ll
@@ -1,21 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
 
-define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
-; ALL-LABEL: test_i8:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    xorl %eax, %eax
-; ALL-NEXT:    cmpl %eax, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    jne .LBB0_2
-; ALL-NEXT:  # %bb.1: # %cond.false
-; ALL-NEXT:    movl %edx, %esi
-; ALL-NEXT:  .LBB0_2: # %cond.end
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
+define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
+; ALL-LABEL: test_i8:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    xorl %ecx, %ecx
+; ALL-NEXT:    cmpl %ecx, %edi
+; ALL-NEXT:    setg %cl
+; ALL-NEXT:    testb $1, %cl
+; ALL-NEXT:    je .LBB0_2
+; ALL-NEXT:  # %bb.1:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
+; ALL-NEXT:    retq
+; ALL-NEXT:  .LBB0_2: # %cond.false
+; ALL-NEXT:    movl %edx, %eax
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
+; ALL-NEXT:    retq
+entry:
+  %cmp = icmp sgt i32 %a, 0
   br i1 %cmp, label %cond.true, label %cond.false
 
 cond.true:                                        ; preds = %entry
@@ -29,21 +32,24 @@
   ret i8 %cond
 }
 
-define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
-; ALL-LABEL: test_i16:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    xorl %eax, %eax
-; ALL-NEXT:    cmpl %eax, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    jne .LBB1_2
-; ALL-NEXT:  # %bb.1: # %cond.false
-; ALL-NEXT:    movl %edx, %esi
-; ALL-NEXT:  .LBB1_2: # %cond.end
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
+define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
+; ALL-LABEL: test_i16:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    xorl %ecx, %ecx
+; ALL-NEXT:    cmpl %ecx, %edi
+; ALL-NEXT:    setg %cl
+; ALL-NEXT:    testb $1, %cl
+; ALL-NEXT:    je .LBB1_2
+; ALL-NEXT:  # %bb.1:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+; ALL-NEXT:  .LBB1_2: # %cond.false
+; ALL-NEXT:    movl %edx, %eax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+entry:
+  %cmp = icmp sgt i32 %a, 0
   br i1 %cmp, label %cond.true, label %cond.false
 
 cond.true:                                        ; preds = %entry
@@ -57,21 +63,21 @@
   ret i16 %cond
 }
 
-define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
-; ALL-LABEL: test_i32:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    xorl %eax, %eax
-; ALL-NEXT:    cmpl %eax, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    jne .LBB2_2
-; ALL-NEXT:  # %bb.1: # %cond.false
-; ALL-NEXT:    movl %edx, %esi
-; ALL-NEXT:  .LBB2_2: # %cond.end
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
+define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
+; ALL-LABEL: test_i32:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    xorl %ecx, %ecx
+; ALL-NEXT:    cmpl %ecx, %edi
+; ALL-NEXT:    setg %cl
+; ALL-NEXT:    testb $1, %cl
+; ALL-NEXT:    jne .LBB2_2
+; ALL-NEXT:  # %bb.1: # %cond.false
+; ALL-NEXT:    movl %edx, %eax
+; ALL-NEXT:  .LBB2_2: # %cond.end
+; ALL-NEXT:    retq
+entry:
+  %cmp = icmp sgt i32 %a, 0
   br i1 %cmp, label %cond.true, label %cond.false
 
 cond.true:                                        ; preds = %entry
@@ -85,21 +91,21 @@
   ret i32 %cond
 }
 
-define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
-; ALL-LABEL: test_i64:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    xorl %eax, %eax
-; ALL-NEXT:    cmpl %eax, %edi
-; ALL-NEXT:    setg %al
-; ALL-NEXT:    testb $1, %al
-; ALL-NEXT:    jne .LBB3_2
-; ALL-NEXT:  # %bb.1: # %cond.false
-; ALL-NEXT:    movq %rdx, %rsi
-; ALL-NEXT:  .LBB3_2: # %cond.end
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-entry:
-  %cmp = icmp sgt i32 %a, 0
+define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
+; ALL-LABEL: test_i64:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    xorl %ecx, %ecx
+; ALL-NEXT:    cmpl %ecx, %edi
+; ALL-NEXT:    setg %cl
+; ALL-NEXT:    testb $1, %cl
+; ALL-NEXT:    jne .LBB3_2
+; ALL-NEXT:  # %bb.1: # %cond.false
+; ALL-NEXT:    movq %rdx, %rax
+; ALL-NEXT:  .LBB3_2: # %cond.end
+; ALL-NEXT:    retq
+entry:
+  %cmp = icmp sgt i32 %a, 0
   br i1 %cmp, label %cond.true, label %cond.false
 
 cond.true:                                        ; preds = %entry
diff --git a/llvm/test/CodeGen/X86/GlobalISel/ptrtoint.ll b/llvm/test/CodeGen/X86/GlobalISel/ptrtoint.ll
index 99ebbd5..4e4decb 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/ptrtoint.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/ptrtoint.ll
@@ -1,43 +1,47 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
 
-define i1 @ptrtoint_s1_p0(i64* %p) {
-; CHECK-LABEL: ptrtoint_s1_p0:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-entry:
-  %0 = ptrtoint i64* %p to i1
+define i1 @ptrtoint_s1_p0(i64* %p) {
+; CHECK-LABEL: ptrtoint_s1_p0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = ptrtoint i64* %p to i1
   ret i1 %0
 }
 
-define i8 @ptrtoint_s8_p0(i64* %p) {
-; CHECK-LABEL: ptrtoint_s8_p0:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-entry:
-  %0 = ptrtoint i64* %p to i8
+define i8 @ptrtoint_s8_p0(i64* %p) {
+; CHECK-LABEL: ptrtoint_s8_p0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = ptrtoint i64* %p to i8
   ret i8 %0
 }
 
-define i16 @ptrtoint_s16_p0(i64* %p) {
-; CHECK-LABEL: ptrtoint_s16_p0:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-entry:
-  %0 = ptrtoint i64* %p to i16
+define i16 @ptrtoint_s16_p0(i64* %p) {
+; CHECK-LABEL: ptrtoint_s16_p0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = ptrtoint i64* %p to i16
   ret i16 %0
 }
 
-define i32 @ptrtoint_s32_p0(i64* %p) {
-; CHECK-LABEL: ptrtoint_s32_p0:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-entry:
-  %0 = ptrtoint i64* %p to i32
+define i32 @ptrtoint_s32_p0(i64* %p) {
+; CHECK-LABEL: ptrtoint_s32_p0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = ptrtoint i64* %p to i32
   ret i32 %0
 }
 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll b/llvm/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
index 02bfa9d..5f82567 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/shl-scalar-widening.ll
@@ -1,67 +1,54 @@
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
-
-define i16 @test_shl_i4(i16 %v, i16 %a, i16 %b) {
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+
+define i16 @test_shl_i4(i16 %v, i16 %a, i16 %b) {
 ; Let's say the arguments are the following unsigned
 ; integers in two’s complement representation:
 ;
-; %v: 77 (0000 0000  0100 1101)
-; %a: 74 (0000 0000  0100 1010)
-; %b: 72 (0000 0000  0100 1000)
-  %v.t = trunc i16 %v to i4  ; %v.t: 13 (1101)
-  %a.t = trunc i16 %a to i4  ; %a.t: 10 (1010)
-  %b.t = trunc i16 %b to i4  ; %b.t:  8 (1000)
+; %v: 77 (0000 0000  0100 1101)
+; %a: 74 (0000 0000  0100 1010)
+; %b: 72 (0000 0000  0100 1000)
+; X64-LABEL: test_shl_i4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    addb %sil, %cl
+; X64-NEXT:    andb $15, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    andw $15, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %v.t = trunc i16 %v to i4  ; %v.t: 13 (1101)
+  %a.t = trunc i16 %a to i4  ; %a.t: 10 (1010)
+  %b.t = trunc i16 %b to i4  ; %b.t:  8 (1000)
   %n.t = add i4 %a.t, %b.t   ; %n.t:  2 (0010)
   %r.t = shl i4 %v.t, %n.t   ; %r.t:  4 (0100)
   %r = zext i4 %r.t to i16
-; %r:  4 (0000 0000 0000 0100)
-  ret i16 %r
-
-; X64-LABEL: test_shl_i4
-;
-; %di:  77 (0000 0000  0100 1101)
-; %si:  74 (0000 0000  0100 1010)
-; %dx:  72 (0000 0000  0100 1000)
-;
-; X64:       # %bb.0:
-;
-; X64-NEXT:    addb %sil, %dl
-; %dx: 146 (0000 0000  1001 0010)
-;
-; X64-NEXT:    andb $15, %dl
-; %dx:   2 (0000 0000  0000 0010)
-;
-; X64-NEXT:    movl %edx, %ecx
-; %cx:   2 (0000 0000  0000 0010)
-;
-; X64-NEXT:    shlb %cl, %dil
-; %di:  52 (0000 0000  0011 0100)
-;
-; X64-NEXT:    andw $15, %di
-; %di:   4 (0000 0000  0000 0100)
-;
-; X64-NEXT:    movl %edi, %eax
-; %ax:   4 (0000 0000  0000 0100)
-;
-; X64-NEXT:    retq
-;
-; Let's pretend that legalizing G_SHL by widening its second
-; source operand is done via G_ANYEXT rather than G_ZEXT and
-; see what happens:
-;
-;              addb %sil, %dl
-; %dx: 146 (0000 0000  1001 0010)
-;
-;              movl %edx, %ecx
-; %cx: 146 (0000 0000  1001 0010)
-;
-;              shlb %cl, %dil
-; %di:   0 (0000 0000  0000 0000)
-;
-;              andw $15, %di
-; %di:   0 (0000 0000  0000 0000)
-;
-;              movl %edi, %eax
-; %ax:   0 (0000 0000  0000 0000)
-;
-;              retq
-}
+; %r:  4 (0000 0000 0000 0100)
+  ret i16 %r
+
+; %di:  77 (0000 0000  0100 1101)
+; %si:  74 (0000 0000  0100 1010)
+; %dx:  72 (0000 0000  0100 1000)
+; %dx: 146 (0000 0000  1001 0010)
+; %dx:   2 (0000 0000  0000 0010)
+; %cx:   2 (0000 0000  0000 0010)
+; %di:  52 (0000 0000  0011 0100)
+; %di:   4 (0000 0000  0000 0100)
+; %ax:   4 (0000 0000  0000 0100)
+; Let's pretend that legalizing G_SHL by widening its second
+; source operand is done via G_ANYEXT rather than G_ZEXT and
+; see what happens:
+;              addb %sil, %dl
+; %dx: 146 (0000 0000  1001 0010)
+;              movl %edx, %ecx
+; %cx: 146 (0000 0000  1001 0010)
+;              shlb %cl, %dil
+; %di:   0 (0000 0000  0000 0000)
+;              andw $15, %di
+; %di:   0 (0000 0000  0000 0000)
+;              movl %edi, %eax
+; %ax:   0 (0000 0000  0000 0000)
+;              retq
+}
diff --git a/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll
index c0dcde7..8d04ae4 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll
@@ -1,176 +1,187 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
 
-define i64 @test_shl_i64(i64 %arg1, i64 %arg2) {
-; X64-LABEL: test_shl_i64:
-; X64:       # %bb.0:
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = shl i64 %arg1, %arg2
-  ret i64 %res
-}
-
-define i64 @test_shl_i64_imm(i64 %arg1) {
-; X64-LABEL: test_shl_i64_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movq $5, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = shl i64 %arg1, 5
-  ret i64 %res
-}
-
-define i64 @test_shl_i64_imm1(i64 %arg1) {
-; X64-LABEL: test_shl_i64_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movq $1, %rcx
-; X64-NEXT:    # kill: def $cl killed $rcx
-; X64-NEXT:    shlq %cl, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %res = shl i64 %arg1, 1
-  ret i64 %res
-}
-
-define i32 @test_shl_i32(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_shl_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = shl i32 %arg1, %arg2
-  ret i32 %res
-}
-
-define i32 @test_shl_i32_imm(i32 %arg1) {
-; X64-LABEL: test_shl_i32_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $5, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = shl i32 %arg1, 5
-  ret i32 %res
-}
-
-define i32 @test_shl_i32_imm1(i32 %arg1) {
-; X64-LABEL: test_shl_i32_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %ecx
-; X64-NEXT:    # kill: def $cl killed $ecx
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %res = shl i32 %arg1, 1
-  ret i32 %res
-}
-
-define i16 @test_shl_i16(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_shl_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    shlw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %a2 = trunc i32 %arg2 to i16
+define i64 @test_shl_i64(i64 %arg1, i64 %arg2) {
+; X64-LABEL: test_shl_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    retq
+  %res = shl i64 %arg1, %arg2
+  ret i64 %res
+}
+
+define i64 @test_shl_i64_imm(i64 %arg1) {
+; X64-LABEL: test_shl_i64_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq $5, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    retq
+  %res = shl i64 %arg1, 5
+  ret i64 %res
+}
+
+define i64 @test_shl_i64_imm1(i64 %arg1) {
+; X64-LABEL: test_shl_i64_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq $1, %rcx
+; X64-NEXT:    # kill: def $cl killed $rcx
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    retq
+  %res = shl i64 %arg1, 1
+  ret i64 %res
+}
+
+define i32 @test_shl_i32(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_shl_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    retq
+  %res = shl i32 %arg1, %arg2
+  ret i32 %res
+}
+
+define i32 @test_shl_i32_imm(i32 %arg1) {
+; X64-LABEL: test_shl_i32_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $5, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    retq
+  %res = shl i32 %arg1, 5
+  ret i32 %res
+}
+
+define i32 @test_shl_i32_imm1(i32 %arg1) {
+; X64-LABEL: test_shl_i32_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $1, %ecx
+; X64-NEXT:    # kill: def $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    retq
+  %res = shl i32 %arg1, 1
+  ret i32 %res
+}
+
+define i16 @test_shl_i16(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_shl_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cx killed $cx killed $ecx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    shlw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %a2 = trunc i32 %arg2 to i16
   %res = shl i16 %a, %a2
   ret i16 %res
 }
 
-define i16 @test_shl_i16_imm(i32 %arg1) {
-; X64-LABEL: test_shl_i16_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    movw $5, %cx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    shlw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %res = shl i16 %a, 5
+define i16 @test_shl_i16_imm(i32 %arg1) {
+; X64-LABEL: test_shl_i16_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movw $5, %cx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    shlw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %res = shl i16 %a, 5
   ret i16 %res
 }
 
-define i16 @test_shl_i16_imm1(i32 %arg1) {
-; X64-LABEL: test_shl_i16_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movw $1, %cx
-; X64-NEXT:    # kill: def $cl killed $cx
-; X64-NEXT:    shlw %cl, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i16
-  %res = shl i16 %a, 1
+define i16 @test_shl_i16_imm1(i32 %arg1) {
+; X64-LABEL: test_shl_i16_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movw $1, %cx
+; X64-NEXT:    # kill: def $cl killed $cx
+; X64-NEXT:    shlw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i16
+  %res = shl i16 %a, 1
   ret i16 %res
 }
 
-define i8 @test_shl_i8(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_shl_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %a2 = trunc i32 %arg2 to i8
+define i8 @test_shl_i8(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_shl_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %a2 = trunc i32 %arg2 to i8
   %res = shl i8 %a, %a2
   ret i8 %res
 }
 
-define i8 @test_shl_i8_imm(i32 %arg1) {
-; X64-LABEL: test_shl_i8_imm:
-; X64:       # %bb.0:
-; X64-NEXT:    shlb $5, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %res = shl i8 %a, 5
+define i8 @test_shl_i8_imm(i32 %arg1) {
+; X64-LABEL: test_shl_i8_imm:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb $5, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %res = shl i8 %a, 5
   ret i8 %res
 }
 
-define i8 @test_shl_i8_imm1(i32 %arg1) {
-; X64-LABEL: test_shl_i8_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    addb %dil, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i8
-  %res = shl i8 %a, 1
+define i8 @test_shl_i8_imm1(i32 %arg1) {
+; X64-LABEL: test_shl_i8_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    addb %al, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i8
+  %res = shl i8 %a, 1
   ret i8 %res
 }
 
-define i1 @test_shl_i1(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_shl_i1:
-; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %sil
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i1
-  %a2 = trunc i32 %arg2 to i1
+define i1 @test_shl_i1(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_shl_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i1
+  %a2 = trunc i32 %arg2 to i1
   %res = shl i1 %a, %a2
   ret i1 %res
 }
 
-define i1 @test_shl_i1_imm1(i32 %arg1) {
-; X64-LABEL: test_shl_i1_imm1:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $-1, %cl
-; X64-NEXT:    andb $1, %cl
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %a = trunc i32 %arg1 to i1
-  %res = shl i1 %a, 1
+define i1 @test_shl_i1_imm1(i32 %arg1) {
+; X64-LABEL: test_shl_i1_imm1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movb $-1, %cl
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %a = trunc i32 %arg1 to i1
+  %res = shl i1 %a, 1
   ret i1 %res
 }
diff --git a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll
index f8d825d..10f6bfe 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/sub-scalar.ll
@@ -1,44 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
 
-define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
-; X64-LABEL: test_sub_i64:
-; X64:       # %bb.0:
-; X64-NEXT:    subq %rsi, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
-  %ret = sub i64 %arg1, %arg2
-  ret i64 %ret
-}
-
-define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
-; X64-LABEL: test_sub_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    subl %esi, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %ret = sub i32 %arg1, %arg2
-  ret i32 %ret
-}
-
-define i16 @test_sub_i16(i16 %arg1, i16 %arg2) {
-; X64-LABEL: test_sub_i16:
-; X64:       # %bb.0:
-; X64-NEXT:    subw %si, %di
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %ret = sub i16 %arg1, %arg2
-  ret i16 %ret
-}
-
-define i8 @test_sub_i8(i8 %arg1, i8 %arg2) {
-; X64-LABEL: test_sub_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    subb %sil, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %ret = sub i8 %arg1, %arg2
-  ret i8 %ret
+define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
+; X64-LABEL: test_sub_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    subq %rsi, %rax
+; X64-NEXT:    retq
+  %ret = sub i64 %arg1, %arg2
+  ret i64 %ret
+}
+
+define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_sub_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    subl %esi, %eax
+; X64-NEXT:    retq
+  %ret = sub i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define i16 @test_sub_i16(i16 %arg1, i16 %arg2) {
+; X64-LABEL: test_sub_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    subw %si, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %ret = sub i16 %arg1, %arg2
+  ret i16 %ret
+}
+
+define i8 @test_sub_i8(i8 %arg1, i8 %arg2) {
+; X64-LABEL: test_sub_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    subb %sil, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %ret = sub i8 %arg1, %arg2
+  ret i8 %ret
 }
 
 define i32 @test_sub_i1(i32 %arg1, i32 %arg2) {
diff --git a/llvm/test/CodeGen/X86/GlobalISel/trunc.ll b/llvm/test/CodeGen/X86/GlobalISel/trunc.ll
index 6c4729f..70af0ae 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/trunc.ll
@@ -2,56 +2,62 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
 
 define i1 @trunc_i32toi1(i32 %a) {
-; CHECK-LABEL: trunc_i32toi1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-  %r = trunc i32 %a to i1
-  ret i1 %r
+; CHECK-LABEL: trunc_i32toi1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %r = trunc i32 %a to i1
+  ret i1 %r
 }
 
 define i8 @trunc_i32toi8(i32 %a) {
-; CHECK-LABEL: trunc_i32toi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-  %r = trunc i32 %a to i8
-  ret i8 %r
+; CHECK-LABEL: trunc_i32toi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+  %r = trunc i32 %a to i8
+  ret i8 %r
 }
 
 define i16 @trunc_i32toi16(i32 %a) {
-; CHECK-LABEL: trunc_i32toi16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-  %r = trunc i32 %a to i16
-  ret i16 %r
-}
-
-define i8 @trunc_i64toi8(i64 %a) {
-; CHECK-LABEL: trunc_i64toi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-  %r = trunc i64 %a to i8
-  ret i8 %r
-}
-
-define i16 @trunc_i64toi16(i64 %a) {
-; CHECK-LABEL: trunc_i64toi16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-  %r = trunc i64 %a to i16
-  ret i16 %r
-}
-
-define i32 @trunc_i64toi32(i64 %a) {
-; CHECK-LABEL: trunc_i64toi32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
-  %r = trunc i64 %a to i32
-  ret i32 %r
+; CHECK-LABEL: trunc_i32toi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retq
+  %r = trunc i32 %a to i16
+  ret i16 %r
+}
+
+define i8 @trunc_i64toi8(i64 %a) {
+; CHECK-LABEL: trunc_i64toi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $al killed $al killed $rax
+; CHECK-NEXT:    retq
+  %r = trunc i64 %a to i8
+  ret i8 %r
+}
+
+define i16 @trunc_i64toi16(i64 %a) {
+; CHECK-LABEL: trunc_i64toi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $rax
+; CHECK-NEXT:    retq
+  %r = trunc i64 %a to i16
+  ret i16 %r
+}
+
+define i32 @trunc_i64toi32(i64 %a) {
+; CHECK-LABEL: trunc_i64toi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT:    retq
+  %r = trunc i64 %a to i32
+  ret i32 %r
 }
 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/undef.ll b/llvm/test/CodeGen/X86/GlobalISel/undef.ll
index 6edd0bf..41d278d 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/undef.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/undef.ll
@@ -8,14 +8,15 @@
   ret i8 undef
 }
 
-define i8 @test2(i8 %a) {
-; ALL-LABEL: test2:
-; ALL:       # %bb.0:
-; ALL-NEXT:    addb %al, %dil
-; ALL-NEXT:    movl %edi, %eax
-; ALL-NEXT:    retq
-  %r = add i8 %a, undef
-  ret i8 %r
+define i8 @test2(i8 %a) {
+; ALL-LABEL: test2:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    addb %al, %al
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
+; ALL-NEXT:    retq
+  %r = add i8 %a, undef
+  ret i8 %r
 }
 
 
diff --git a/llvm/test/CodeGen/X86/GlobalISel/xor-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/xor-scalar.ll
index 069edaa..8ae8e16 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/xor-scalar.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/xor-scalar.ll
@@ -16,43 +16,45 @@
   ret i32 %ret
 }
 
-define i8 @test_xor_i8(i8 %arg1, i8 %arg2) {
-; ALL-LABEL: test_xor_i8:
-; ALL:       # %bb.0:
-; ALL-NEXT:    xorb %dil, %sil
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = xor i8 %arg1, %arg2
-  ret i8 %ret
-}
-
-define i16 @test_xor_i16(i16 %arg1, i16 %arg2) {
-; ALL-LABEL: test_xor_i16:
-; ALL:       # %bb.0:
-; ALL-NEXT:    xorw %di, %si
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = xor i16 %arg1, %arg2
-  ret i16 %ret
-}
-
-define i32 @test_xor_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_xor_i32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    xorl %edi, %esi
-; ALL-NEXT:    movl %esi, %eax
-; ALL-NEXT:    retq
-  %ret = xor i32 %arg1, %arg2
-  ret i32 %ret
-}
-
-define i64 @test_xor_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_xor_i64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    xorq %rdi, %rsi
-; ALL-NEXT:    movq %rsi, %rax
-; ALL-NEXT:    retq
-  %ret = xor i64 %arg1, %arg2
-  ret i64 %ret
+define i8 @test_xor_i8(i8 %arg1, i8 %arg2) {
+; ALL-LABEL: test_xor_i8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    xorb %dil, %al
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
+; ALL-NEXT:    retq
+  %ret = xor i8 %arg1, %arg2
+  ret i8 %ret
+}
+
+define i16 @test_xor_i16(i16 %arg1, i16 %arg2) {
+; ALL-LABEL: test_xor_i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    xorw %di, %ax
+; ALL-NEXT:    # kill: def $ax killed $ax killed $eax
+; ALL-NEXT:    retq
+  %ret = xor i16 %arg1, %arg2
+  ret i16 %ret
+}
+
+define i32 @test_xor_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_xor_i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movl %esi, %eax
+; ALL-NEXT:    xorl %edi, %eax
+; ALL-NEXT:    retq
+  %ret = xor i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define i64 @test_xor_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_xor_i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    xorq %rdi, %rax
+; ALL-NEXT:    retq
+  %ret = xor i64 %arg1, %arg2
+  ret i64 %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/add.ll b/llvm/test/CodeGen/X86/add.ll
index cd919ef..dbf3715 100644
--- a/llvm/test/CodeGen/X86/add.ll
+++ b/llvm/test/CodeGen/X86/add.ll
@@ -16,14 +16,14 @@
 ;
 ; X64-LINUX-LABEL: test1:
 ; X64-LINUX:       # %bb.0: # %entry
-; X64-LINUX-NEXT:    subl $-128, %edi
 ; X64-LINUX-NEXT:    movl %edi, %eax
+; X64-LINUX-NEXT:    subl $-128, %eax
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: test1:
 ; X64-WIN32:       # %bb.0: # %entry
-; X64-WIN32-NEXT:    subl $-128, %ecx
 ; X64-WIN32-NEXT:    movl %ecx, %eax
+; X64-WIN32-NEXT:    subl $-128, %eax
 ; X64-WIN32-NEXT:    retq
 entry:
   %b = add i32 %a, 128
@@ -38,14 +38,14 @@
 ;
 ; X64-LINUX-LABEL: test2:
 ; X64-LINUX:       # %bb.0: # %entry
-; X64-LINUX-NEXT:    subq $-2147483648, %rdi # imm = 0x80000000
 ; X64-LINUX-NEXT:    movq %rdi, %rax
+; X64-LINUX-NEXT:    subq $-2147483648, %rax # imm = 0x80000000
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: test2:
 ; X64-WIN32:       # %bb.0: # %entry
-; X64-WIN32-NEXT:    subq $-2147483648, %rcx # imm = 0x80000000
 ; X64-WIN32-NEXT:    movq %rcx, %rax
+; X64-WIN32-NEXT:    subq $-2147483648, %rax # imm = 0x80000000
 ; X64-WIN32-NEXT:    retq
 entry:
   %b = add i64 %a, 2147483648
@@ -60,14 +60,14 @@
 ;
 ; X64-LINUX-LABEL: test3:
 ; X64-LINUX:       # %bb.0: # %entry
-; X64-LINUX-NEXT:    subq $-128, %rdi
 ; X64-LINUX-NEXT:    movq %rdi, %rax
+; X64-LINUX-NEXT:    subq $-128, %rax
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: test3:
 ; X64-WIN32:       # %bb.0: # %entry
-; X64-WIN32-NEXT:    subq $-128, %rcx
 ; X64-WIN32-NEXT:    movq %rcx, %rax
+; X64-WIN32-NEXT:    subq $-128, %rax
 ; X64-WIN32-NEXT:    retq
 entry:
   %b = add i64 %a, 128
@@ -204,16 +204,16 @@
 ;
 ; X64-LINUX-LABEL: test7:
 ; X64-LINUX:       # %bb.0: # %entry
-; X64-LINUX-NEXT:    addl %esi, %edi
-; X64-LINUX-NEXT:    setb %dl
 ; X64-LINUX-NEXT:    movl %edi, %eax
+; X64-LINUX-NEXT:    addl %esi, %eax
+; X64-LINUX-NEXT:    setb %dl
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: test7:
 ; X64-WIN32:       # %bb.0: # %entry
-; X64-WIN32-NEXT:    addl %edx, %ecx
-; X64-WIN32-NEXT:    setb %dl
 ; X64-WIN32-NEXT:    movl %ecx, %eax
+; X64-WIN32-NEXT:    addl %edx, %eax
+; X64-WIN32-NEXT:    setb %dl
 ; X64-WIN32-NEXT:    retq
 entry:
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
@@ -233,16 +233,16 @@
 ;
 ; X64-LINUX-LABEL: test8:
 ; X64-LINUX:       # %bb.0: # %entry
-; X64-LINUX-NEXT:    addq %rsi, %rdi
-; X64-LINUX-NEXT:    setb %dl
 ; X64-LINUX-NEXT:    movq %rdi, %rax
+; X64-LINUX-NEXT:    addq %rsi, %rax
+; X64-LINUX-NEXT:    setb %dl
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: test8:
 ; X64-WIN32:       # %bb.0: # %entry
-; X64-WIN32-NEXT:    addq %rdx, %rcx
-; X64-WIN32-NEXT:    setb %dl
 ; X64-WIN32-NEXT:    movq %rcx, %rax
+; X64-WIN32-NEXT:    addq %rdx, %rax
+; X64-WIN32-NEXT:    setb %dl
 ; X64-WIN32-NEXT:    retq
 entry:
   %extleft = zext i64 %left to i65
@@ -268,20 +268,20 @@
 ;
 ; X64-LINUX-LABEL: test9:
 ; X64-LINUX:       # %bb.0: # %entry
-; X64-LINUX-NEXT:    xorl %eax, %eax
-; X64-LINUX-NEXT:    cmpl $10, %edi
-; X64-LINUX-NEXT:    sete %al
-; X64-LINUX-NEXT:    subl %eax, %esi
 ; X64-LINUX-NEXT:    movl %esi, %eax
+; X64-LINUX-NEXT:    xorl %ecx, %ecx
+; X64-LINUX-NEXT:    cmpl $10, %edi
+; X64-LINUX-NEXT:    sete %cl
+; X64-LINUX-NEXT:    subl %ecx, %eax
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: test9:
 ; X64-WIN32:       # %bb.0: # %entry
-; X64-WIN32-NEXT:    xorl %eax, %eax
-; X64-WIN32-NEXT:    cmpl $10, %ecx
-; X64-WIN32-NEXT:    sete %al
-; X64-WIN32-NEXT:    subl %eax, %edx
 ; X64-WIN32-NEXT:    movl %edx, %eax
+; X64-WIN32-NEXT:    xorl %edx, %edx
+; X64-WIN32-NEXT:    cmpl $10, %ecx
+; X64-WIN32-NEXT:    sete %dl
+; X64-WIN32-NEXT:    subl %edx, %eax
 ; X64-WIN32-NEXT:    retq
 entry:
   %cmp = icmp eq i32 %x, 10
@@ -392,14 +392,14 @@
 ;
 ; X64-LINUX-LABEL: inc_not:
 ; X64-LINUX:       # %bb.0:
-; X64-LINUX-NEXT:    negl %edi
 ; X64-LINUX-NEXT:    movl %edi, %eax
+; X64-LINUX-NEXT:    negl %eax
 ; X64-LINUX-NEXT:    retq
 ;
 ; X64-WIN32-LABEL: inc_not:
 ; X64-WIN32:       # %bb.0:
-; X64-WIN32-NEXT:    negl %ecx
 ; X64-WIN32-NEXT:    movl %ecx, %eax
+; X64-WIN32-NEXT:    negl %eax
 ; X64-WIN32-NEXT:    retq
   %nota = xor i32 %a, -1
   %r = add i32 %nota, 1
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index 33220eb..956d52c 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -4,9 +4,9 @@
 define i128 @add128(i128 %a, i128 %b) nounwind {
 ; CHECK-LABEL: add128:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq %rdx, %rdi
-; CHECK-NEXT:    adcq %rcx, %rsi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    adcq %rcx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rdx
 ; CHECK-NEXT:    retq
 entry:
@@ -43,6 +43,7 @@
 define i256 @add256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: add256:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    addq %r9, %rsi
 ; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
 ; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
@@ -51,7 +52,6 @@
 ; CHECK-NEXT:    movq %rsi, (%rdi)
 ; CHECK-NEXT:    movq %rcx, 16(%rdi)
 ; CHECK-NEXT:    movq %r8, 24(%rdi)
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = add i256 %a, %b
@@ -197,6 +197,7 @@
 define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
 ; CHECK-LABEL: pr31719:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    addq (%rsi), %rdx
 ; CHECK-NEXT:    adcq 8(%rsi), %rcx
 ; CHECK-NEXT:    adcq 16(%rsi), %r8
@@ -205,7 +206,6 @@
 ; CHECK-NEXT:    movq %rcx, 8(%rdi)
 ; CHECK-NEXT:    movq %r8, 16(%rdi)
 ; CHECK-NEXT:    movq %r9, 24(%rdi)
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = extractvalue %scalar %arg.b, 0
@@ -292,9 +292,9 @@
 define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: shiftadd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addq %rsi, %rdi
-; CHECK-NEXT:    adcq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    adcq %rcx, %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = zext i64 %a to i128
@@ -312,23 +312,23 @@
 define %S @readd(%S* nocapture readonly %this, %S %arg.b) {
 ; CHECK-LABEL: readd:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    addq (%rsi), %rdx
-; CHECK-NEXT:    movq 8(%rsi), %r10
-; CHECK-NEXT:    adcq $0, %r10
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    addq %rcx, %r10
-; CHECK-NEXT:    adcq 16(%rsi), %rax
+; CHECK-NEXT:    movq 8(%rsi), %r11
+; CHECK-NEXT:    adcq $0, %r11
+; CHECK-NEXT:    setb %r10b
+; CHECK-NEXT:    movzbl %r10b, %edi
+; CHECK-NEXT:    addq %rcx, %r11
+; CHECK-NEXT:    adcq 16(%rsi), %rdi
 ; CHECK-NEXT:    setb %cl
 ; CHECK-NEXT:    movzbl %cl, %ecx
-; CHECK-NEXT:    addq %r8, %rax
+; CHECK-NEXT:    addq %r8, %rdi
 ; CHECK-NEXT:    adcq 24(%rsi), %rcx
 ; CHECK-NEXT:    addq %r9, %rcx
-; CHECK-NEXT:    movq %rdx, (%rdi)
-; CHECK-NEXT:    movq %r10, 8(%rdi)
-; CHECK-NEXT:    movq %rax, 16(%rdi)
-; CHECK-NEXT:    movq %rcx, 24(%rdi)
-; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %r11, 8(%rax)
+; CHECK-NEXT:    movq %rdi, 16(%rax)
+; CHECK-NEXT:    movq %rcx, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = extractvalue %S %arg.b, 0
@@ -377,10 +377,10 @@
 define i128 @addcarry1_not(i128 %n) {
 ; CHECK-LABEL: addcarry1_not:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    negq %rdi
-; CHECK-NEXT:    sbbq %rsi, %rdx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    negq %rax
+; CHECK-NEXT:    sbbq %rsi, %rdx
 ; CHECK-NEXT:    retq
   %1 = xor i128 %n, -1
   %2 = add i128 %1, 1
diff --git a/llvm/test/CodeGen/X86/and-encoding.ll b/llvm/test/CodeGen/X86/and-encoding.ll
index 51cdbd9..52fd997 100644
--- a/llvm/test/CodeGen/X86/and-encoding.ll
+++ b/llvm/test/CodeGen/X86/and-encoding.ll
@@ -46,9 +46,9 @@
 define i32 @lopped32_32to8(i32 %x) {
 ; CHECK-LABEL: lopped32_32to8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrl $4, %edi # encoding: [0xc1,0xef,0x04]
-; CHECK-NEXT:    andl $-16, %edi # encoding: [0x83,0xe7,0xf0]
 ; CHECK-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT:    shrl $4, %eax # encoding: [0xc1,0xe8,0x04]
+; CHECK-NEXT:    andl $-16, %eax # encoding: [0x83,0xe0,0xf0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %shr = lshr i32 %x, 4
   %and = and i32 %shr, 268435440
@@ -60,9 +60,9 @@
 define i64 @lopped64_32to8(i64 %x) {
 ; CHECK-LABEL: lopped64_32to8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $36, %rdi # encoding: [0x48,0xc1,0xef,0x24]
-; CHECK-NEXT:    andl $-16, %edi # encoding: [0x83,0xe7,0xf0]
 ; CHECK-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    shrq $36, %rax # encoding: [0x48,0xc1,0xe8,0x24]
+; CHECK-NEXT:    andl $-16, %eax # encoding: [0x83,0xe0,0xf0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %shr = lshr i64 %x, 36
   %and = and i64 %shr, 268435440
@@ -74,9 +74,9 @@
 define i64 @lopped64_64to8(i64 %x) {
 ; CHECK-LABEL: lopped64_64to8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $4, %rdi # encoding: [0x48,0xc1,0xef,0x04]
-; CHECK-NEXT:    andq $-16, %rdi # encoding: [0x48,0x83,0xe7,0xf0]
 ; CHECK-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    shrq $4, %rax # encoding: [0x48,0xc1,0xe8,0x04]
+; CHECK-NEXT:    andq $-16, %rax # encoding: [0x48,0x83,0xe0,0xf0]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %shr = lshr i64 %x, 4
   %and = and i64 %shr, 1152921504606846960
@@ -88,10 +88,10 @@
 define i64 @lopped64_64to32(i64 %x) {
 ; CHECK-LABEL: lopped64_64to32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $4, %rdi # encoding: [0x48,0xc1,0xef,0x04]
-; CHECK-NEXT:    andq $-983056, %rdi # encoding: [0x48,0x81,0xe7,0xf0,0xff,0xf0,0xff]
-; CHECK-NEXT:    # imm = 0xFFF0FFF0
 ; CHECK-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    shrq $4, %rax # encoding: [0x48,0xc1,0xe8,0x04]
+; CHECK-NEXT:    andq $-983056, %rax # encoding: [0x48,0x25,0xf0,0xff,0xf0,0xff]
+; CHECK-NEXT:    # imm = 0xFFF0FFF0
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %shr = lshr i64 %x, 4
   %and = and i64 %shr, 1152921504605863920
diff --git a/llvm/test/CodeGen/X86/andimm8.ll b/llvm/test/CodeGen/X86/andimm8.ll
index d430f73..9cc0bfc 100644
--- a/llvm/test/CodeGen/X86/andimm8.ll
+++ b/llvm/test/CodeGen/X86/andimm8.ll
@@ -14,9 +14,8 @@
 ;
 ; X64-LABEL: bra:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $-64, %edi # encoding: [0x83,0xe7,0xc0]
-; X64-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; X64-NEXT:    movl %edi, %eax # encoding: [0x89,0xf8]
+; X64-NEXT:    andl $-64, %eax # encoding: [0x83,0xe0,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
  %t1 = zext i32 %zed to i64
  %t2 = and i64  %t1, 4294967232
@@ -57,8 +56,8 @@
 ;
 ; X64-LABEL: bar:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $42, %edi # encoding: [0x83,0xe7,0x2a]
 ; X64-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; X64-NEXT:    andl $42, %eax # encoding: [0x83,0xe0,0x2a]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %t1 = and i64 %zed, 42
   ret i64 %t1
@@ -75,9 +74,9 @@
 ;
 ; X64-LABEL: baz:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $2147483647, %edi # encoding: [0x81,0xe7,0xff,0xff,0xff,0x7f]
-; X64-NEXT:    # imm = 0x7FFFFFFF
 ; X64-NEXT:    movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; X64-NEXT:    andl $2147483647, %eax # encoding: [0x25,0xff,0xff,0xff,0x7f]
+; X64-NEXT:    # imm = 0x7FFFFFFF
 ; X64-NEXT:    retq # encoding: [0xc3]
   %t1 = and i64 %zed, 2147483647
   ret i64 %t1
diff --git a/llvm/test/CodeGen/X86/anyext.ll b/llvm/test/CodeGen/X86/anyext.ll
index e1435d6..66be521 100644
--- a/llvm/test/CodeGen/X86/anyext.ll
+++ b/llvm/test/CodeGen/X86/anyext.ll
@@ -41,8 +41,9 @@
 ;
 ; X64-LABEL: bar:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    divw %si
 ; X64-NEXT:    # kill: def $ax killed $ax def $eax
 ; X64-NEXT:    andl $1, %eax
diff --git a/llvm/test/CodeGen/X86/apm.ll b/llvm/test/CodeGen/X86/apm.ll
index 9750dd0..859f33a 100644
--- a/llvm/test/CodeGen/X86/apm.ll
+++ b/llvm/test/CodeGen/X86/apm.ll
@@ -17,8 +17,8 @@
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    leaq (%rdi), %rax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    leaq (%rdi), %rax
 ; X64-NEXT:    monitor
 ; X64-NEXT:    retq
 ;
@@ -46,8 +46,8 @@
 ;
 ; X64-LABEL: bar:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    mwait
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll b/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll
index b8b2b7f..b2a0d27 100644
--- a/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll
+++ b/llvm/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -5,16 +5,16 @@
 define i32 @test_add_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
 ; FASTINCDEC-LABEL: test_add_1_cmov_slt:
 ; FASTINCDEC:       # %bb.0: # %entry
-; FASTINCDEC-NEXT:    lock incq (%rdi)
-; FASTINCDEC-NEXT:    cmovgl %edx, %esi
 ; FASTINCDEC-NEXT:    movl %esi, %eax
+; FASTINCDEC-NEXT:    lock incq (%rdi)
+; FASTINCDEC-NEXT:    cmovgl %edx, %eax
 ; FASTINCDEC-NEXT:    retq
 ;
 ; SLOWINCDEC-LABEL: test_add_1_cmov_slt:
 ; SLOWINCDEC:       # %bb.0: # %entry
-; SLOWINCDEC-NEXT:    lock addq $1, (%rdi)
-; SLOWINCDEC-NEXT:    cmovgl %edx, %esi
 ; SLOWINCDEC-NEXT:    movl %esi, %eax
+; SLOWINCDEC-NEXT:    lock addq $1, (%rdi)
+; SLOWINCDEC-NEXT:    cmovgl %edx, %eax
 ; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
@@ -26,16 +26,16 @@
 define i32 @test_add_1_cmov_sge(i64* %p, i32 %a0, i32 %a1) #0 {
 ; FASTINCDEC-LABEL: test_add_1_cmov_sge:
 ; FASTINCDEC:       # %bb.0: # %entry
-; FASTINCDEC-NEXT:    lock incq (%rdi)
-; FASTINCDEC-NEXT:    cmovlel %edx, %esi
 ; FASTINCDEC-NEXT:    movl %esi, %eax
+; FASTINCDEC-NEXT:    lock incq (%rdi)
+; FASTINCDEC-NEXT:    cmovlel %edx, %eax
 ; FASTINCDEC-NEXT:    retq
 ;
 ; SLOWINCDEC-LABEL: test_add_1_cmov_sge:
 ; SLOWINCDEC:       # %bb.0: # %entry
-; SLOWINCDEC-NEXT:    lock addq $1, (%rdi)
-; SLOWINCDEC-NEXT:    cmovlel %edx, %esi
 ; SLOWINCDEC-NEXT:    movl %esi, %eax
+; SLOWINCDEC-NEXT:    lock addq $1, (%rdi)
+; SLOWINCDEC-NEXT:    cmovlel %edx, %eax
 ; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
@@ -47,16 +47,16 @@
 define i32 @test_sub_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
 ; FASTINCDEC-LABEL: test_sub_1_cmov_sle:
 ; FASTINCDEC:       # %bb.0: # %entry
-; FASTINCDEC-NEXT:    lock decq (%rdi)
-; FASTINCDEC-NEXT:    cmovgel %edx, %esi
 ; FASTINCDEC-NEXT:    movl %esi, %eax
+; FASTINCDEC-NEXT:    lock decq (%rdi)
+; FASTINCDEC-NEXT:    cmovgel %edx, %eax
 ; FASTINCDEC-NEXT:    retq
 ;
 ; SLOWINCDEC-LABEL: test_sub_1_cmov_sle:
 ; SLOWINCDEC:       # %bb.0: # %entry
-; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
-; SLOWINCDEC-NEXT:    cmovgel %edx, %esi
 ; SLOWINCDEC-NEXT:    movl %esi, %eax
+; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT:    cmovgel %edx, %eax
 ; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
@@ -68,16 +68,16 @@
 define i32 @test_sub_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
 ; FASTINCDEC-LABEL: test_sub_1_cmov_sgt:
 ; FASTINCDEC:       # %bb.0: # %entry
-; FASTINCDEC-NEXT:    lock decq (%rdi)
-; FASTINCDEC-NEXT:    cmovll %edx, %esi
 ; FASTINCDEC-NEXT:    movl %esi, %eax
+; FASTINCDEC-NEXT:    lock decq (%rdi)
+; FASTINCDEC-NEXT:    cmovll %edx, %eax
 ; FASTINCDEC-NEXT:    retq
 ;
 ; SLOWINCDEC-LABEL: test_sub_1_cmov_sgt:
 ; SLOWINCDEC:       # %bb.0: # %entry
-; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
-; SLOWINCDEC-NEXT:    cmovll %edx, %esi
 ; SLOWINCDEC-NEXT:    movl %esi, %eax
+; SLOWINCDEC-NEXT:    lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT:    cmovll %edx, %eax
 ; SLOWINCDEC-NEXT:    retq
 entry:
   %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
@@ -159,11 +159,11 @@
 define i32 @test_add_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_add_1_cmov_sle:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
-; CHECK-NEXT:    cmovgl %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    lock xaddq %rcx, (%rdi)
+; CHECK-NEXT:    testq %rcx, %rcx
+; CHECK-NEXT:    cmovgl %edx, %eax
 ; CHECK-NEXT:    retq
 entry:
   %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
@@ -175,11 +175,11 @@
 define i32 @test_add_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_add_1_cmov_sgt:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
-; CHECK-NEXT:    cmovlel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    lock xaddq %rcx, (%rdi)
+; CHECK-NEXT:    testq %rcx, %rcx
+; CHECK-NEXT:    cmovlel %edx, %eax
 ; CHECK-NEXT:    retq
 entry:
   %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll
index 896b6d2..76475b3 100644
--- a/llvm/test/CodeGen/X86/atomic128.ll
+++ b/llvm/test/CodeGen/X86/atomic128.ll
@@ -12,10 +12,9 @@
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rcx, %r9
+; CHECK-NEXT:    movq %rcx, %rbx
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    movq %r8, %rcx
-; CHECK-NEXT:    movq %r9, %rbx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 2f071b4..2fe0767 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1638,6 +1638,7 @@
 define <512 x i8> @avg_v512i8_3(<512 x i8> %a, <512 x i8> %b) nounwind {
 ; SSE2-LABEL: avg_v512i8_3:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm8
 ; SSE2-NEXT:    movdqa %xmm8, 496(%rdi)
@@ -1726,7 +1727,6 @@
 ; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
 ; SSE2-NEXT:    pavgb {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: avg_v512i8_3:
@@ -1735,6 +1735,7 @@
 ; AVX1-NEXT:    movq %rsp, %rbp
 ; AVX1-NEXT:    andq $-32, %rsp
 ; AVX1-NEXT:    subq $128, %rsp
+; AVX1-NEXT:    movq %rdi, %rax
 ; AVX1-NEXT:    vmovdqa 144(%rbp), %ymm8
 ; AVX1-NEXT:    vmovdqa 112(%rbp), %ymm9
 ; AVX1-NEXT:    vmovdqa 80(%rbp), %ymm10
@@ -1861,7 +1862,6 @@
 ; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
-; AVX1-NEXT:    movq %rdi, %rax
 ; AVX1-NEXT:    movq %rbp, %rsp
 ; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    vzeroupper
@@ -1873,6 +1873,7 @@
 ; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    andq $-32, %rsp
 ; AVX2-NEXT:    subq $32, %rsp
+; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vmovdqa 240(%rbp), %ymm8
 ; AVX2-NEXT:    vmovdqa 208(%rbp), %ymm9
 ; AVX2-NEXT:    vmovdqa 176(%rbp), %ymm10
@@ -1913,7 +1914,6 @@
 ; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdi)
 ; AVX2-NEXT:    vmovdqa %ymm1, 32(%rdi)
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
-; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    movq %rbp, %rsp
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    vzeroupper
@@ -1925,6 +1925,7 @@
 ; AVX512F-NEXT:    movq %rsp, %rbp
 ; AVX512F-NEXT:    andq $-32, %rsp
 ; AVX512F-NEXT:    subq $32, %rsp
+; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    vmovdqa 240(%rbp), %ymm8
 ; AVX512F-NEXT:    vmovdqa 208(%rbp), %ymm9
 ; AVX512F-NEXT:    vmovdqa 176(%rbp), %ymm10
@@ -1965,7 +1966,6 @@
 ; AVX512F-NEXT:    vmovdqa %ymm2, 64(%rdi)
 ; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdi)
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdi)
-; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    movq %rbp, %rsp
 ; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    vzeroupper
@@ -1977,6 +1977,7 @@
 ; AVX512BW-NEXT:    movq %rsp, %rbp
 ; AVX512BW-NEXT:    andq $-64, %rsp
 ; AVX512BW-NEXT:    subq $64, %rsp
+; AVX512BW-NEXT:    movq %rdi, %rax
 ; AVX512BW-NEXT:    vpavgb 16(%rbp), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpavgb 80(%rbp), %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpavgb 144(%rbp), %zmm2, %zmm2
@@ -1993,7 +1994,6 @@
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
-; AVX512BW-NEXT:    movq %rdi, %rax
 ; AVX512BW-NEXT:    movq %rbp, %rsp
 ; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll
index 5319700..9d6c6c9 100644
--- a/llvm/test/CodeGen/X86/avoid-sfb.ll
+++ b/llvm/test/CodeGen/X86/avoid-sfb.ll
@@ -727,28 +727,29 @@
 define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 {
 ; CHECK-LABEL: test_stack:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-NEXT:    movups %xmm0, (%rdi)
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq %rax, 16(%rdi)
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    movl %eax, 24(%rdi)
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    movl %eax, 28(%rdi)
-; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq %rcx, 16(%rdi)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movl %ecx, 24(%rdi)
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movl %ecx, 28(%rdi)
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %esi
 ; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %edx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_stack:
 ; DISABLED:       # %bb.0: # %entry
+; DISABLED-NEXT:    movq %rdi, %rax
 ; DISABLED-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
 ; DISABLED-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
 ; DISABLED-NEXT:    movups %xmm0, (%rdi)
@@ -758,51 +759,50 @@
 ; DISABLED-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
 ; DISABLED-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; DISABLED-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
-; DISABLED-NEXT:    movq %rdi, %rax
 ; DISABLED-NEXT:    retq
 ;
 ; CHECK-AVX2-LABEL: test_stack:
 ; CHECK-AVX2:       # %bb.0: # %entry
+; CHECK-AVX2-NEXT:    movq %rdi, %rax
 ; CHECK-AVX2-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
 ; CHECK-AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rdi)
-; CHECK-AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-AVX2-NEXT:    movq %rax, 16(%rdi)
-; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 24(%rdi)
-; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 28(%rdi)
+; CHECK-AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-AVX2-NEXT:    movq %rcx, 16(%rdi)
+; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX2-NEXT:    movl %ecx, 24(%rdi)
+; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX2-NEXT:    movl %ecx, 28(%rdi)
 ; CHECK-AVX2-NEXT:    vmovups {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-AVX2-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
-; CHECK-AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-AVX2-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX2-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX2-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; CHECK-AVX2-NEXT:    movq %rdi, %rax
+; CHECK-AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-AVX2-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX2-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX2-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test_stack:
 ; CHECK-AVX512:       # %bb.0: # %entry
+; CHECK-AVX512-NEXT:    movq %rdi, %rax
 ; CHECK-AVX512-NEXT:    movl %esi, {{[0-9]+}}(%rsp)
 ; CHECK-AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rdi)
-; CHECK-AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-AVX512-NEXT:    movq %rax, 16(%rdi)
-; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 24(%rdi)
-; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 28(%rdi)
+; CHECK-AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-AVX512-NEXT:    movq %rcx, 16(%rdi)
+; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX512-NEXT:    movl %ecx, 24(%rdi)
+; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX512-NEXT:    movl %ecx, 28(%rdi)
 ; CHECK-AVX512-NEXT:    vmovups {{[0-9]+}}(%rsp), %xmm0
 ; CHECK-AVX512-NEXT:    vmovups %xmm0, {{[0-9]+}}(%rsp)
-; CHECK-AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-AVX512-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX512-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-AVX512-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
-; CHECK-AVX512-NEXT:    movq %rdi, %rax
+; CHECK-AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-AVX512-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX512-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-AVX512-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
 ; CHECK-AVX512-NEXT:    retq
 entry:
   %s6.sroa.0.0..sroa_cast1 = bitcast %struct.S6* %s2 to i8*
diff --git a/llvm/test/CodeGen/X86/avx-intel-ocl.ll b/llvm/test/CodeGen/X86/avx-intel-ocl.ll
index 53165d8..4560061 100644
--- a/llvm/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx-intel-ocl.ll
@@ -122,8 +122,8 @@
 
 ; pass parameters in registers for 64-bit platform
 ; X64-LABEL: test_int
-; X64: leal {{.*}}, %edi
 ; X64: movl {{.*}}, %esi
+; X64: leal {{.*}}, %edi
 ; X64: call
 ; X64: addl {{.*}}, %eax
 define i32 @test_int(i32 %a, i32 %b) nounwind {
diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
index 7615f65..89f74a4 100644
--- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -75,8 +75,7 @@
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: insert_undef_pd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
 ret <4 x double> %res
@@ -86,8 +85,7 @@
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: insert_undef_ps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
 ret <8 x float> %res
@@ -97,8 +95,7 @@
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: insert_undef_si:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
 ret <8 x i32> %res
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index f44e276..d836e9e 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -904,9 +904,9 @@
 define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
 ; CHECK-LABEL: test_mask_broadcast_vaddpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
                                       double* %j, <8 x i64> %mask1) nounwind {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b313ee4..4145fe9 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -272,9 +272,9 @@
 define i32 @test10(i32 %a, i32 %b, i1 %cond) {
 ; ALL_X64-LABEL: test10:
 ; ALL_X64:       ## %bb.0:
-; ALL_X64-NEXT:    testb $1, %dl
-; ALL_X64-NEXT:    cmovel %esi, %edi
 ; ALL_X64-NEXT:    movl %edi, %eax
+; ALL_X64-NEXT:    testb $1, %dl
+; ALL_X64-NEXT:    cmovel %esi, %eax
 ; ALL_X64-NEXT:    retq
 ;
 ; KNL_X32-LABEL: test10:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 4171cf5..db3716c 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -195,21 +195,21 @@
 define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
 ; KNL-LABEL: test12:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    testb $1, %al
-; KNL-NEXT:    cmoveq %rsi, %rdi
 ; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    testb $1, %cl
+; KNL-NEXT:    cmoveq %rsi, %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test12:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    testb $1, %al
-; SKX-NEXT:    cmoveq %rsi, %rdi
 ; SKX-NEXT:    movq %rdi, %rax
+; SKX-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
+; SKX-NEXT:    kmovd %k0, %ecx
+; SKX-NEXT:    testb $1, %cl
+; SKX-NEXT:    cmoveq %rsi, %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %cmpvector_func.i = icmp slt <16 x i64> %a, %b
@@ -257,23 +257,23 @@
 define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
 ; KNL-LABEL: test14:
 ; KNL:       ## %bb.0:
+; KNL-NEXT:    movq %rdi, %rax
 ; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
 ; KNL-NEXT:    kshiftrw $4, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    testb $1, %al
-; KNL-NEXT:    cmoveq %rsi, %rdi
-; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    testb $1, %cl
+; KNL-NEXT:    cmoveq %rsi, %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test14:
 ; SKX:       ## %bb.0:
+; SKX-NEXT:    movq %rdi, %rax
 ; SKX-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
 ; SKX-NEXT:    kshiftrw $4, %k0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    testb $1, %al
-; SKX-NEXT:    cmoveq %rsi, %rdi
-; SKX-NEXT:    movq %rdi, %rax
+; SKX-NEXT:    kmovd %k0, %ecx
+; SKX-NEXT:    testb $1, %cl
+; SKX-NEXT:    cmoveq %rsi, %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %cmpvector_func.i = icmp slt <8 x i64> %a, %b
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 7cb6d788..3542ec7 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -5853,9 +5853,10 @@
 ;
 ; X64-LABEL: test_kand:
 ; X64:       ## %bb.0:
-; X64-NEXT:    andl %esi, %edi ## encoding: [0x21,0xf7]
-; X64-NEXT:    andl $8, %edi ## encoding: [0x83,0xe7,0x08]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    andl %esi, %eax ## encoding: [0x21,0xf0]
+; X64-NEXT:    andl $8, %eax ## encoding: [0x83,0xe0,0x08]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
@@ -5875,9 +5876,10 @@
 ;
 ; X64-LABEL: test_kandn:
 ; X64:       ## %bb.0:
-; X64-NEXT:    orl $-9, %edi ## encoding: [0x83,0xcf,0xf7]
-; X64-NEXT:    andl %esi, %edi ## encoding: [0x21,0xf7]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    orl $-9, %eax ## encoding: [0x83,0xc8,0xf7]
+; X64-NEXT:    andl %esi, %eax ## encoding: [0x21,0xf0]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1)
@@ -5895,8 +5897,9 @@
 ;
 ; X64-LABEL: test_knot:
 ; X64:       ## %bb.0:
-; X64-NEXT:    notl %edi ## encoding: [0xf7,0xd7]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    notl %eax ## encoding: [0xf7,0xd0]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
   ret i16 %res
@@ -5914,9 +5917,10 @@
 ;
 ; X64-LABEL: test_kor:
 ; X64:       ## %bb.0:
-; X64-NEXT:    orl %esi, %edi ## encoding: [0x09,0xf7]
-; X64-NEXT:    orl $8, %edi ## encoding: [0x83,0xcf,0x08]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    orl %esi, %eax ## encoding: [0x09,0xf0]
+; X64-NEXT:    orl $8, %eax ## encoding: [0x83,0xc8,0x08]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1)
@@ -5937,9 +5941,10 @@
 ;
 ; X64-LABEL: test_kxnor:
 ; X64:       ## %bb.0:
-; X64-NEXT:    xorl %esi, %edi ## encoding: [0x31,0xf7]
-; X64-NEXT:    xorl $8, %edi ## encoding: [0x83,0xf7,0x08]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    xorl %esi, %eax ## encoding: [0x31,0xf0]
+; X64-NEXT:    xorl $8, %eax ## encoding: [0x83,0xf0,0x08]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
@@ -5958,9 +5963,10 @@
 ;
 ; X64-LABEL: test_kxor:
 ; X64:       ## %bb.0:
-; X64-NEXT:    xorl %esi, %edi ## encoding: [0x31,0xf7]
-; X64-NEXT:    xorl $8, %edi ## encoding: [0x83,0xf7,0x08]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    xorl %esi, %eax ## encoding: [0x31,0xf0]
+; X64-NEXT:    xorl $8, %eax ## encoding: [0x83,0xf0,0x08]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index a99b8bf..1449f5c 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -9,8 +9,9 @@
 define i16 @mask16(i16 %x) {
 ; CHECK-LABEL: mask16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    notl %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: mask16:
@@ -47,8 +48,9 @@
 define i8 @mask8(i8 %x) {
 ; CHECK-LABEL: mask8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    notb %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notb %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: mask8:
@@ -149,10 +151,11 @@
 ; CHECK-LABEL: mand16:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    orl %eax, %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    xorl %esi, %ecx
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: mand16:
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 9bdb19a..ea705d1 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -924,47 +924,46 @@
 ; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    subl $20, %esp
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    subl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    imull %ebp, %edx
-; X32-NEXT:    subl %esi, %ebx
+; X32-NEXT:    subl %ecx, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    imull %ebp, %ebx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    subl %edi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %ebp
+; X32-NEXT:    imull %ebp, %ecx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    imull %ebp, %eax
-; X32-NEXT:    addl %eax, %edx
+; X32-NEXT:    addl %eax, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    imull %ebp, %edx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    imull %edi, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    addl $20, %esp
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    popl %ebp
diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll
index 28af00a..5915523 100755
--- a/llvm/test/CodeGen/X86/avx512-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx512-schedule.ll
@@ -947,16 +947,16 @@
 define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind {
 ; GENERIC-LABEL: test_mask_broadcast_vaddpd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [10:1.00]
 ; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm0 {%k1} # sched: [10:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test_mask_broadcast_vaddpd:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50]
 ; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    vptestmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm0 {%k1} # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
@@ -6669,14 +6669,16 @@
 define i16 @mask16(i16 %x) {
 ; GENERIC-LABEL: mask16:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    notl %edi # sched: [1:0.33]
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    notl %eax # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mask16:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    notl %edi # sched: [1:0.25]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    notl %eax # sched: [1:0.25]
+; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -6706,14 +6708,16 @@
 define i8 @mask8(i8 %x) {
 ; GENERIC-LABEL: mask8:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    notb %dil # sched: [1:0.33]
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    notb %al # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $al killed $al killed $eax
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mask8:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    notb %dil # sched: [1:0.25]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    notb %al # sched: [1:0.25]
+; SKX-NEXT:    # kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -6788,19 +6792,21 @@
 ; GENERIC-LABEL: mand16:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    xorl %esi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    andl %esi, %edi # sched: [1:0.33]
-; GENERIC-NEXT:    orl %eax, %edi # sched: [1:0.33]
-; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    movl %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    xorl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    andl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: mand16:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; SKX-NEXT:    xorl %esi, %eax # sched: [1:0.25]
-; SKX-NEXT:    andl %esi, %edi # sched: [1:0.25]
-; SKX-NEXT:    orl %eax, %edi # sched: [1:0.25]
-; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    movl %edi, %ecx # sched: [1:0.25]
+; SKX-NEXT:    xorl %esi, %ecx # sched: [1:0.25]
+; SKX-NEXT:    andl %esi, %eax # sched: [1:0.25]
+; SKX-NEXT:    orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %ma = bitcast i16 %x to <16 x i1>
   %mb = bitcast i16 %y to <16 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index aadec5a..2ad2251 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -135,8 +135,9 @@
 ;
 ; X64-LABEL: select05:
 ; X64:       # %bb.0:
-; X64-NEXT:    orl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
@@ -185,8 +186,9 @@
 ;
 ; X64-LABEL: select06:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll
index 7ddb1f3..c6a2da1 100644
--- a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -4,8 +4,8 @@
 define i32 @mask32(i32 %x) {
 ; CHECK-LABEL: mask32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    notl %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notl %eax
 ; CHECK-NEXT:    retq
   %m0 = bitcast i32 %x to <32 x i1>
   %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -19,8 +19,8 @@
 define i64 @mask64(i64 %x) {
 ; CHECK-LABEL: mask64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    notq %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    notq %rax
 ; CHECK-NEXT:    retq
   %m0 = bitcast i64 %x to <64 x i1>
   %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -79,10 +79,10 @@
 ; CHECK-LABEL: mand32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    orl %eax, %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    andl %esi, %ecx
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i32 %x to <32 x i1>
   %mb = bitcast i32 %y to <32 x i1>
@@ -116,10 +116,10 @@
 ; CHECK-LABEL: mand64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    andq %rsi, %rax
-; CHECK-NEXT:    xorq %rsi, %rdi
-; CHECK-NEXT:    orq %rax, %rdi
-; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    xorq %rsi, %rax
+; CHECK-NEXT:    orq %rcx, %rax
 ; CHECK-NEXT:    retq
   %ma = bitcast i64 %x to <64 x i1>
   %mb = bitcast i64 %y to <64 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
index 7bc367f..06bbeef 100644
--- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -4,8 +4,9 @@
 define i8 @mask8(i8 %x) {
 ; CHECK-LABEL: mask8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    notb %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notb %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -32,10 +33,11 @@
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    orl %eax, %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    xorl %esi, %ecx
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i8 %x to <8 x i1>
   %mb = bitcast i8 %y to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vl-arith.ll b/llvm/test/CodeGen/X86/avx512vl-arith.ll
index 967ac3b..9c6de82 100755
--- a/llvm/test/CodeGen/X86/avx512vl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-arith.ll
@@ -408,9 +408,9 @@
 define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_mask_broadcast_vaddpd_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
-; CHECK-NEXT:    vaddpd (%rdi){1to4}, %ymm1, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x39,0x58,0x0f]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
+; CHECK-NEXT:    vptestmq %ymm2, %ymm2, %k1 ## encoding: [0x62,0xf2,0xed,0x28,0x27,0xca]
+; CHECK-NEXT:    vaddpd (%rdi){1to4}, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x39,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <4 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
@@ -835,9 +835,9 @@
 define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
 ; CHECK-LABEL: test_mask_broadcast_vaddpd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
-; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm1, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x19,0x58,0x0f]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-NEXT:    vptestmq %xmm2, %xmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x08,0x27,0xca]
+; CHECK-NEXT:    vaddpd (%rdi){1to2}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x19,0x58,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %mask = icmp ne <2 x i64> %mask1, zeroinitializer
   %tmp = load double, double* %j
diff --git a/llvm/test/CodeGen/X86/bigstructret.ll b/llvm/test/CodeGen/X86/bigstructret.ll
index d4db764..b459330 100644
--- a/llvm/test/CodeGen/X86/bigstructret.ll
+++ b/llvm/test/CodeGen/X86/bigstructret.ll
@@ -8,20 +8,20 @@
 define fastcc %0 @ReturnBigStruct() nounwind readnone {
 ; X86-LABEL: ReturnBigStruct:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl $24601, 12(%ecx) # imm = 0x6019
 ; X86-NEXT:    movl $48, 8(%ecx)
 ; X86-NEXT:    movl $24, 4(%ecx)
 ; X86-NEXT:    movl $12, (%ecx)
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ReturnBigStruct:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movabsq $105660490448944, %rax # imm = 0x601900000030
-; X64-NEXT:    movq %rax, 8(%rdi)
-; X64-NEXT:    movabsq $103079215116, %rax # imm = 0x180000000C
-; X64-NEXT:    movq %rax, (%rdi)
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movabsq $105660490448944, %rcx # imm = 0x601900000030
+; X64-NEXT:    movq %rcx, 8(%rdi)
+; X64-NEXT:    movabsq $103079215116, %rcx # imm = 0x180000000C
+; X64-NEXT:    movq %rcx, (%rdi)
 ; X64-NEXT:    retq
 entry:
   %0 = insertvalue %0 zeroinitializer, i32 12, 0
@@ -35,18 +35,18 @@
 define fastcc %1 @ReturnBigStruct2() nounwind readnone {
 ; X86-LABEL: ReturnBigStruct2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl $48, 4(%ecx)
 ; X86-NEXT:    movb $1, 2(%ecx)
 ; X86-NEXT:    movw $256, (%ecx) # imm = 0x100
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ReturnBigStruct2:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movl $48, 4(%rdi)
 ; X64-NEXT:    movb $1, 2(%rdi)
 ; X64-NEXT:    movw $256, (%rdi) # imm = 0x100
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
   %0 = insertvalue %1 zeroinitializer, i1 false, 0
diff --git a/llvm/test/CodeGen/X86/bitcast-i256.ll b/llvm/test/CodeGen/X86/bitcast-i256.ll
index a29292e..0a1953b 100644
--- a/llvm/test/CodeGen/X86/bitcast-i256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-i256.ll
@@ -5,16 +5,16 @@
 define i256 @foo(<8 x i32> %a) {
 ; FAST-LABEL: foo:
 ; FAST:       # %bb.0:
-; FAST-NEXT:    vmovups %ymm0, (%rdi)
 ; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    vmovups %ymm0, (%rdi)
 ; FAST-NEXT:    vzeroupper
 ; FAST-NEXT:    retq
 ;
 ; SLOW-LABEL: foo:
 ; SLOW:       # %bb.0:
+; SLOW-NEXT:    movq %rdi, %rax
 ; SLOW-NEXT:    vextractf128 $1, %ymm0, 16(%rdi)
 ; SLOW-NEXT:    vmovups %xmm0, (%rdi)
-; SLOW-NEXT:    movq %rdi, %rax
 ; SLOW-NEXT:    vzeroupper
 ; SLOW-NEXT:    retq
   %r = bitcast <8 x i32> %a to i256
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index c1b1140..1acc834 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -193,8 +193,8 @@
 define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
 ; SSE2-SSSE3-LABEL: bitcast_i32_32i1:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movl %esi, (%rdi)
 ; SSE2-SSSE3-NEXT:    movq %rdi, %rax
+; SSE2-SSSE3-NEXT:    movl %esi, (%rdi)
 ; SSE2-SSSE3-NEXT:    retq
 ;
 ; AVX1-LABEL: bitcast_i32_32i1:
@@ -250,14 +250,14 @@
 define <64 x i1> @bitcast_i64_64i1(i64 %a0) {
 ; SSE2-SSSE3-LABEL: bitcast_i64_64i1:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movq %rsi, (%rdi)
 ; SSE2-SSSE3-NEXT:    movq %rdi, %rax
+; SSE2-SSSE3-NEXT:    movq %rsi, (%rdi)
 ; SSE2-SSSE3-NEXT:    retq
 ;
 ; AVX12-LABEL: bitcast_i64_64i1:
 ; AVX12:       # %bb.0:
-; AVX12-NEXT:    movq %rsi, (%rdi)
 ; AVX12-NEXT:    movq %rdi, %rax
+; AVX12-NEXT:    movq %rsi, (%rdi)
 ; AVX12-NEXT:    retq
 ;
 ; AVX512-LABEL: bitcast_i64_64i1:
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index aeca4c3..2e35fde 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -341,20 +341,21 @@
 ;
 ; X64-LABEL: test_bitreverse_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolb $4, %dil
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $51, %al
-; X64-NEXT:    shlb $2, %al
-; X64-NEXT:    andb $-52, %dil
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    addb %al, %al
-; X64-NEXT:    andb $-86, %dil
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolb $4, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $51, %cl
+; X64-NEXT:    shlb $2, %cl
+; X64-NEXT:    andb $-52, %al
+; X64-NEXT:    shrb $2, %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $85, %cl
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    andb $-86, %al
+; X64-NEXT:    shrb %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %b
@@ -384,21 +385,22 @@
 ;
 ; X64-LABEL: test_bitreverse_i4:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolb $4, %dil
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $51, %al
-; X64-NEXT:    shlb $2, %al
-; X64-NEXT:    andb $-52, %dil
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andb $80, %al
-; X64-NEXT:    addb %al, %al
-; X64-NEXT:    andb $-96, %dil
-; X64-NEXT:    shrb %dil
-; X64-NEXT:    orb %al, %dil
-; X64-NEXT:    shrb $4, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolb $4, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $51, %cl
+; X64-NEXT:    shlb $2, %cl
+; X64-NEXT:    andb $-52, %al
+; X64-NEXT:    shrb $2, %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andb $80, %cl
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    andb $-96, %al
+; X64-NEXT:    shrb %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    shrb $4, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %b = call i4 @llvm.bitreverse.i4(i4 %a)
   ret i4 %b
@@ -474,6 +476,7 @@
 ; X64-LABEL: identity_i8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   %c = call i8 @llvm.bitreverse.i8(i8 %b)
diff --git a/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll b/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
index f86df57..01c225d 100644
--- a/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
+++ b/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
@@ -10,9 +10,9 @@
 define i64 @test__andn_u64(i64 %a0, i64 %a1) {
 ; X64-LABEL: test__andn_u64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorq $-1, %rdi
-; X64-NEXT:    andq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    xorq $-1, %rax
+; X64-NEXT:    andq %rsi, %rax
 ; X64-NEXT:    retq
   %xor = xor i64 %a0, -1
   %res = and i64 %xor, %a1
@@ -84,9 +84,9 @@
 define i64 @test_andn_u64(i64 %a0, i64 %a1) {
 ; X64-LABEL: test_andn_u64:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorq $-1, %rdi
-; X64-NEXT:    andq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    xorq $-1, %rax
+; X64-NEXT:    andq %rsi, %rax
 ; X64-NEXT:    retq
   %xor = xor i64 %a0, -1
   %res = and i64 %xor, %a1
diff --git a/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
index 52ccd24..a784466 100644
--- a/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
@@ -47,9 +47,9 @@
 ;
 ; X64-LABEL: test__andn_u32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl $-1, %edi
-; X64-NEXT:    andl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl $-1, %eax
+; X64-NEXT:    andl %esi, %eax
 ; X64-NEXT:    retq
   %xor = xor i32 %a0, -1
   %res = and i32 %xor, %a1
@@ -199,9 +199,9 @@
 ;
 ; X64-LABEL: test_andn_u32:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl $-1, %edi
-; X64-NEXT:    andl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl $-1, %eax
+; X64-NEXT:    andl %esi, %eax
 ; X64-NEXT:    retq
   %xor = xor i32 %a0, -1
   %res = and i32 %xor, %a1
diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll
index 2fc7cd5..945ff82 100644
--- a/llvm/test/CodeGen/X86/bmi.ll
+++ b/llvm/test/CodeGen/X86/bmi.ll
@@ -421,9 +421,9 @@
 ;
 ; X64-LABEL: non_bextr32:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    shrl $2, %edi
-; X64-NEXT:    andl $111, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrl $2, %eax
+; X64-NEXT:    andl $111, %eax
 ; X64-NEXT:    retq
 entry:
   %shr = lshr i32 %x, 2
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index 782d3fa..bf78cb4 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -130,15 +130,15 @@
 ;
 ; X64-LABEL: mulx32:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    addl %edi, %edi
-; X64-NEXT:    addl %esi, %esi
-; X64-NEXT:    imulq %rdi, %rsi
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    movl %eax, (%rdx)
-; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    imulq %rdi, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movl %ecx, (%rdx)
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %x1 = add i32 %x, %x
   %y1 = add i32 %y, %y
@@ -165,14 +165,14 @@
 ;
 ; X64-LABEL: mulx32_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    addl %edi, %edi
-; X64-NEXT:    movl (%rsi), %eax
-; X64-NEXT:    imulq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    movl %eax, (%rdx)
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    imulq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    shrq $32, %rcx
+; X64-NEXT:    movl %ecx, (%rdx)
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %x1 = add i32 %x, %x
   %y1 = load i32, i32* %y
diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll
index 8a63d52..86cda14 100644
--- a/llvm/test/CodeGen/X86/bool-math.ll
+++ b/llvm/test/CodeGen/X86/bool-math.ll
@@ -32,9 +32,10 @@
 define i8 @sub_zext_cmp_mask_narrower_result(i32 %x) {
 ; CHECK-LABEL: sub_zext_cmp_mask_narrower_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    orb $46, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    orb $46, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
@@ -46,9 +47,10 @@
 define i8 @add_zext_cmp_mask_same_size_result(i8 %x) {
 ; CHECK-LABEL: add_zext_cmp_mask_same_size_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    xorb $27, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    xorb $27, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
@@ -60,9 +62,9 @@
 define i32 @add_zext_cmp_mask_wider_result(i8 %x) {
 ; CHECK-LABEL: add_zext_cmp_mask_wider_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    xorl $27, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    xorl $27, %eax
 ; CHECK-NEXT:    retq
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
@@ -74,9 +76,10 @@
 define i8 @add_zext_cmp_mask_narrower_result(i32 %x) {
 ; CHECK-LABEL: add_zext_cmp_mask_narrower_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    xorb $43, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    xorb $43, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = and i32 %x, 1
   %c = icmp eq i32 %a, 0
@@ -128,9 +131,10 @@
 define i8 @low_bit_select_constants_bigger_true_same_size_result(i8 %x) {
 ; CHECK-LABEL: low_bit_select_constants_bigger_true_same_size_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    xorb $-29, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    xorb $-29, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
@@ -141,9 +145,9 @@
 define i32 @low_bit_select_constants_bigger_true_wider_result(i8 %x) {
 ; CHECK-LABEL: low_bit_select_constants_bigger_true_wider_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    xorl $227, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    xorl $227, %eax
 ; CHECK-NEXT:    retq
   %a = and i8 %x, 1
   %c = icmp eq i8 %a, 0
@@ -154,9 +158,10 @@
 define i8 @low_bit_select_constants_bigger_true_narrower_result(i16 %x) {
 ; CHECK-LABEL: low_bit_select_constants_bigger_true_narrower_result:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    xorb $41, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    xorb $41, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = and i16 %x, 1
   %c = icmp eq i16 %a, 0
diff --git a/llvm/test/CodeGen/X86/bool-simplify.ll b/llvm/test/CodeGen/X86/bool-simplify.ll
index bbb7eb7..edc36fd 100644
--- a/llvm/test/CodeGen/X86/bool-simplify.ll
+++ b/llvm/test/CodeGen/X86/bool-simplify.ll
@@ -4,9 +4,9 @@
 define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    ptest %xmm0, %xmm0
-; CHECK-NEXT:    cmovnel %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ptest %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
   %t2 = icmp ne i32 %t1, 0
diff --git a/llvm/test/CodeGen/X86/bswap-rotate.ll b/llvm/test/CodeGen/X86/bswap-rotate.ll
index 62798ba..3326c1b 100644
--- a/llvm/test/CodeGen/X86/bswap-rotate.ll
+++ b/llvm/test/CodeGen/X86/bswap-rotate.ll
@@ -14,8 +14,9 @@
 ;
 ; X64-LABEL: combine_bswap_rotate:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw $9, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolw $9, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %1 = call i16 @llvm.bswap.i16(i16 %a0)
   %2 = shl i16 %1, 1
diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll
index 8d64161..1ba107a 100644
--- a/llvm/test/CodeGen/X86/bswap-wide-int.ll
+++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll
@@ -25,14 +25,14 @@
 ;
 ; X64-LABEL: bswap_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    bswapq %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    bswapq %rax
 ; X64-NEXT:    retq
 ;
 ; X64-MOVBE-LABEL: bswap_i64:
 ; X64-MOVBE:       # %bb.0:
-; X64-MOVBE-NEXT:    bswapq %rdi
 ; X64-MOVBE-NEXT:    movq %rdi, %rax
+; X64-MOVBE-NEXT:    bswapq %rax
 ; X64-MOVBE-NEXT:    retq
   %1 = call i64 @llvm.bswap.i64(i64 %a0)
   ret i64 %1
@@ -79,17 +79,17 @@
 ;
 ; X64-LABEL: bswap_i128:
 ; X64:       # %bb.0:
-; X64-NEXT:    bswapq %rsi
-; X64-NEXT:    bswapq %rdi
 ; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    bswapq %rax
+; X64-NEXT:    bswapq %rdi
 ; X64-NEXT:    movq %rdi, %rdx
 ; X64-NEXT:    retq
 ;
 ; X64-MOVBE-LABEL: bswap_i128:
 ; X64-MOVBE:       # %bb.0:
-; X64-MOVBE-NEXT:    bswapq %rsi
-; X64-MOVBE-NEXT:    bswapq %rdi
 ; X64-MOVBE-NEXT:    movq %rsi, %rax
+; X64-MOVBE-NEXT:    bswapq %rax
+; X64-MOVBE-NEXT:    bswapq %rdi
 ; X64-MOVBE-NEXT:    movq %rdi, %rdx
 ; X64-MOVBE-NEXT:    retq
   %1 = call i128 @llvm.bswap.i128(i128 %a0)
@@ -149,6 +149,7 @@
 ;
 ; X64-LABEL: bswap_i256:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    bswapq %r8
 ; X64-NEXT:    bswapq %rcx
 ; X64-NEXT:    bswapq %rdx
@@ -157,16 +158,15 @@
 ; X64-NEXT:    movq %rdx, 16(%rdi)
 ; X64-NEXT:    movq %rcx, 8(%rdi)
 ; X64-NEXT:    movq %r8, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 ;
 ; X64-MOVBE-LABEL: bswap_i256:
 ; X64-MOVBE:       # %bb.0:
+; X64-MOVBE-NEXT:    movq %rdi, %rax
 ; X64-MOVBE-NEXT:    movbeq %rsi, 24(%rdi)
 ; X64-MOVBE-NEXT:    movbeq %rdx, 16(%rdi)
 ; X64-MOVBE-NEXT:    movbeq %rcx, 8(%rdi)
 ; X64-MOVBE-NEXT:    movbeq %r8, (%rdi)
-; X64-MOVBE-NEXT:    movq %rdi, %rax
 ; X64-MOVBE-NEXT:    retq
   %1 = call i256 @llvm.bswap.i256(i256 %a0)
   ret i256 %1
diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll
index c831de3..756dd7fa 100644
--- a/llvm/test/CodeGen/X86/bswap.ll
+++ b/llvm/test/CodeGen/X86/bswap.ll
@@ -19,8 +19,9 @@
 ;
 ; CHECK64-LABEL: W:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    rolw $8, %di
 ; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    rolw $8, %ax
+; CHECK64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK64-NEXT:    retq
         %Z = call i16 @llvm.bswap.i16( i16 %A )         ; <i16> [#uses=1]
         ret i16 %Z
@@ -35,8 +36,8 @@
 ;
 ; CHECK64-LABEL: X:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    bswapl %edi
 ; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    bswapl %eax
 ; CHECK64-NEXT:    retq
         %Z = call i32 @llvm.bswap.i32( i32 %A )         ; <i32> [#uses=1]
         ret i32 %Z
@@ -53,8 +54,8 @@
 ;
 ; CHECK64-LABEL: Y:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    bswapq %rdi
 ; CHECK64-NEXT:    movq %rdi, %rax
+; CHECK64-NEXT:    bswapq %rax
 ; CHECK64-NEXT:    retq
         %Z = call i64 @llvm.bswap.i64( i64 %A )         ; <i64> [#uses=1]
         ret i64 %Z
@@ -71,9 +72,9 @@
 ;
 ; CHECK64-LABEL: test1:
 ; CHECK64:       # %bb.0: # %entry
-; CHECK64-NEXT:    bswapl %edi
-; CHECK64-NEXT:    shrl $16, %edi
 ; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    shrl $16, %eax
 ; CHECK64-NEXT:    retq
 entry:
 
@@ -95,9 +96,9 @@
 ;
 ; CHECK64-LABEL: test2:
 ; CHECK64:       # %bb.0: # %entry
-; CHECK64-NEXT:    bswapl %edi
-; CHECK64-NEXT:    sarl $16, %edi
 ; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    sarl $16, %eax
 ; CHECK64-NEXT:    retq
 entry:
 
diff --git a/llvm/test/CodeGen/X86/bswap_tree.ll b/llvm/test/CodeGen/X86/bswap_tree.ll
index acd9330..537acdb 100644
--- a/llvm/test/CodeGen/X86/bswap_tree.ll
+++ b/llvm/test/CodeGen/X86/bswap_tree.ll
@@ -20,9 +20,9 @@
 ;
 ; CHECK64-LABEL: test1:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    bswapl %edi
-; CHECK64-NEXT:    roll $16, %edi
 ; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    roll $16, %eax
 ; CHECK64-NEXT:    retq
   %byte0 = and i32 %x, 255        ; 0x000000ff
   %byte1 = and i32 %x, 65280      ; 0x0000ff00
@@ -53,9 +53,9 @@
 ;
 ; CHECK64-LABEL: test2:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    bswapl %edi
-; CHECK64-NEXT:    roll $16, %edi
 ; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    roll $16, %eax
 ; CHECK64-NEXT:    retq
   %byte1 = shl  i32 %x, 8
   %byte0 = lshr i32 %x, 8
diff --git a/llvm/test/CodeGen/X86/bswap_tree2.ll b/llvm/test/CodeGen/X86/bswap_tree2.ll
index f4d75f4..d716d82 100644
--- a/llvm/test/CodeGen/X86/bswap_tree2.ll
+++ b/llvm/test/CodeGen/X86/bswap_tree2.ll
@@ -25,16 +25,16 @@
 ; CHECK64-LABEL: test1:
 ; CHECK64:       # %bb.0:
 ; CHECK64-NEXT:    movl %edi, %eax
-; CHECK64-NEXT:    andl $16711680, %eax # imm = 0xFF0000
 ; CHECK64-NEXT:    movl %edi, %ecx
-; CHECK64-NEXT:    orl $-16777216, %ecx # imm = 0xFF000000
-; CHECK64-NEXT:    shll $8, %eax
-; CHECK64-NEXT:    shrl $8, %ecx
-; CHECK64-NEXT:    orl %eax, %ecx
-; CHECK64-NEXT:    bswapl %edi
-; CHECK64-NEXT:    shrl $16, %edi
-; CHECK64-NEXT:    orl %ecx, %edi
-; CHECK64-NEXT:    movl %edi, %eax
+; CHECK64-NEXT:    andl $16711680, %ecx # imm = 0xFF0000
+; CHECK64-NEXT:    movl %edi, %edx
+; CHECK64-NEXT:    orl $-16777216, %edx # imm = 0xFF000000
+; CHECK64-NEXT:    shll $8, %ecx
+; CHECK64-NEXT:    shrl $8, %edx
+; CHECK64-NEXT:    orl %ecx, %edx
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    shrl $16, %eax
+; CHECK64-NEXT:    orl %edx, %eax
 ; CHECK64-NEXT:    retq
   %byte0 = and i32 %x, 255        ; 0x000000ff
   %byte1 = and i32 %x, 65280      ; 0x0000ff00
diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll
index 144e9e7..83c751f 100644
--- a/llvm/test/CodeGen/X86/bt.ll
+++ b/llvm/test/CodeGen/X86/bt.ll
@@ -1112,16 +1112,16 @@
 ;
 ; X64-LABEL: demanded_i32:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    shrl $5, %eax
-; X64-NEXT:    movl (%rdi,%rax,4), %r8d
-; X64-NEXT:    movl $1, %edi
-; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shll %cl, %edi
-; X64-NEXT:    btl %edx, %r8d
+; X64-NEXT:    movl (%rdi,%rax,4), %edi
+; X64-NEXT:    movl $1, %edx
+; X64-NEXT:    shll %cl, %edx
+; X64-NEXT:    btl %ecx, %edi
 ; X64-NEXT:    jae .LBB30_2
 ; X64-NEXT:  # %bb.1:
-; X64-NEXT:    orl %edi, (%rsi,%rax,4)
+; X64-NEXT:    orl %edx, (%rsi,%rax,4)
 ; X64-NEXT:  .LBB30_2:
 ; X64-NEXT:    retq
   %4 = lshr i32 %2, 5
diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll
index 4b14d39..951794c 100644
--- a/llvm/test/CodeGen/X86/btc_bts_btr.ll
+++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll
@@ -6,8 +6,9 @@
 define i16 @btr_16(i16 %x, i16 %n) {
 ; X64-LABEL: btr_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movw $-2, %ax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movw $-2, %ax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolw %cl, %ax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -28,8 +29,9 @@
 define i16 @bts_16(i16 %x, i16 %n) {
 ; X64-LABEL: bts_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    btsl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btsl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_16:
@@ -48,8 +50,9 @@
 define i16 @btc_16(i16 %x, i16 %n) {
 ; X64-LABEL: btc_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    btcl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btcl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_16:
@@ -68,8 +71,8 @@
 define i32 @btr_32(i32 %x, i32 %n) {
 ; X64-LABEL: btr_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    btrl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btrl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_32:
@@ -87,8 +90,8 @@
 define i32 @bts_32(i32 %x, i32 %n) {
 ; X64-LABEL: bts_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    btsl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btsl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_32:
@@ -105,8 +108,8 @@
 define i32 @btc_32(i32 %x, i32 %n) {
 ; X64-LABEL: btc_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    btcl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btcl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_32:
@@ -123,8 +126,8 @@
 define i64 @btr_64(i64 %x, i64 %n) {
 ; X64-LABEL: btr_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    btrq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    btrq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_64:
@@ -154,8 +157,8 @@
 define i64 @bts_64(i64 %x, i64 %n) {
 ; X64-LABEL: bts_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    btsq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    btsq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_64:
@@ -182,8 +185,8 @@
 define i64 @btc_64(i64 %x, i64 %n) {
 ; X64-LABEL: btc_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    btcq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    btcq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_64:
@@ -210,8 +213,9 @@
 define i16 @btr_16_mask(i16 %x, i16 %n) {
 ; X64-LABEL: btr_16_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    movw $-2, %ax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movw $-2, %ax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolw %cl, %ax
 ; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -233,9 +237,10 @@
 define i16 @bts_16_mask(i16 %x, i16 %n) {
 ; X64-LABEL: bts_16_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $15, %sil
-; X64-NEXT:    btsl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $15, %sil
+; X64-NEXT:    btsl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_16_mask:
@@ -256,9 +261,10 @@
 define i16 @btc_16_mask(i16 %x, i16 %n) {
 ; X64-LABEL: btc_16_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $15, %sil
-; X64-NEXT:    btcl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $15, %sil
+; X64-NEXT:    btcl %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_16_mask:
@@ -279,8 +285,8 @@
 define i32 @btr_32_mask(i32 %x, i32 %n) {
 ; X64-LABEL: btr_32_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    btrl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btrl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_32_mask:
@@ -299,8 +305,8 @@
 define i32 @bts_32_mask(i32 %x, i32 %n) {
 ; X64-LABEL: bts_32_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    btsl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btsl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_32_mask:
@@ -318,8 +324,8 @@
 define i32 @btc_32_mask(i32 %x, i32 %n) {
 ; X64-LABEL: btc_32_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    btcl %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    btcl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_32_mask:
@@ -337,8 +343,8 @@
 define i64 @btr_64_mask(i64 %x, i64 %n) {
 ; X64-LABEL: btr_64_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    btrq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    btrq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btr_64_mask:
@@ -369,8 +375,8 @@
 define i64 @bts_64_mask(i64 %x, i64 %n) {
 ; X64-LABEL: bts_64_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    btsq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    btsq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: bts_64_mask:
@@ -398,8 +404,8 @@
 define i64 @btc_64_mask(i64 %x, i64 %n) {
 ; X64-LABEL: btc_64_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    btcq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    btcq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: btc_64_mask:
@@ -450,8 +456,9 @@
 define i16 @bts_16_load(i16* %x, i16 %n) {
 ; X64-LABEL: bts_16_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    orw (%rdi), %ax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -475,8 +482,9 @@
 define i16 @btc_16_load(i16* %x, i16 %n) {
 ; X64-LABEL: btc_16_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    xorw (%rdi), %ax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -673,8 +681,9 @@
 define void @btr_16_dont_fold(i16* %x, i16 %n) {
 ; X64-LABEL: btr_16_dont_fold:
 ; X64:       # %bb.0:
-; X64-NEXT:    movw $-2, %ax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movw $-2, %ax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolw %cl, %ax
 ; X64-NEXT:    andw %ax, (%rdi)
 ; X64-NEXT:    retq
@@ -698,8 +707,9 @@
 define void @bts_16_dont_fold(i16* %x, i16 %n) {
 ; X64-LABEL: bts_16_dont_fold:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    orw %ax, (%rdi)
 ; X64-NEXT:    retq
@@ -722,8 +732,9 @@
 define void @btc_16_dont_fold(i16* %x, i16 %n) {
 ; X64-LABEL: btc_16_dont_fold:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    xorw %ax, (%rdi)
 ; X64-NEXT:    retq
@@ -746,8 +757,9 @@
 define void @btr_32_dont_fold(i32* %x, i32 %n) {
 ; X64-LABEL: btr_32_dont_fold:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $-2, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $-2, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    andl %eax, (%rdi)
 ; X64-NEXT:    retq
@@ -771,8 +783,9 @@
 define void @bts_32_dont_fold(i32* %x, i32 %n) {
 ; X64-LABEL: bts_32_dont_fold:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    orl %eax, (%rdi)
 ; X64-NEXT:    retq
@@ -795,8 +808,9 @@
 define void @btc_32_dont_fold(i32* %x, i32 %n) {
 ; X64-LABEL: btc_32_dont_fold:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    xorl %eax, (%rdi)
 ; X64-NEXT:    retq
@@ -819,8 +833,9 @@
 define void @btr_64_dont_fold(i64* %x, i64 %n) {
 ; X64-LABEL: btr_64_dont_fold:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movq $-2, %rax
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    rolq %cl, %rax
 ; X64-NEXT:    andq %rax, (%rdi)
 ; X64-NEXT:    retq
@@ -860,8 +875,9 @@
 define void @bts_64_dont_fold(i64* %x, i64 %n) {
 ; X64-LABEL: bts_64_dont_fold:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    orq %rax, (%rdi)
 ; X64-NEXT:    retq
@@ -898,8 +914,9 @@
 define void @btc_64_dont_fold(i64* %x, i64 %n) {
 ; X64-LABEL: btc_64_dont_fold:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movl $1, %eax
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    xorq %rax, (%rdi)
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index bed775d..24d73f5 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -8,17 +8,17 @@
 ; CHECK-LABEL: Test_get_quotient:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    orq %rsi, %rcx
+; CHECK-NEXT:    shrq $32, %rcx
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    cqto
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    # kill: def $eax killed $eax def $rax
 ; CHECK-NEXT:    retq
@@ -30,21 +30,20 @@
 ; CHECK-LABEL: Test_get_remainder:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    orq %rsi, %rcx
+; CHECK-NEXT:    shrq $32, %rcx
 ; CHECK-NEXT:    je .LBB1_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    cqto
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_1:
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    divl %esi
-; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
-; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    retq
   %result = srem i64 %a, %b
   ret i64 %result
@@ -54,18 +53,18 @@
 ; CHECK-LABEL: Test_get_quotient_and_remainder:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    orq %rsi, %rax
-; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    orq %rsi, %rcx
+; CHECK-NEXT:    shrq $32, %rcx
 ; CHECK-NEXT:    je .LBB2_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    cqto
 ; CHECK-NEXT:    idivq %rsi
 ; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB2_1:
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    # kill: def $eax killed $eax def $rax
diff --git a/llvm/test/CodeGen/X86/clear-highbits.ll b/llvm/test/CodeGen/X86/clear-highbits.ll
index 3590b65..b23a696 100644
--- a/llvm/test/CodeGen/X86/clear-highbits.ll
+++ b/llvm/test/CodeGen/X86/clear-highbits.ll
@@ -33,10 +33,11 @@
 ; X64-LABEL: clear_highbits8_c0:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = lshr i8 -1, %numhighbits
   %masked = and i8 %mask, %val
@@ -79,10 +80,11 @@
 ; X64-LABEL: clear_highbits8_c4_commutative:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shrb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = lshr i8 -1, %numhighbits
   %masked = and i8 %val, %mask ; swapped order
@@ -340,10 +342,10 @@
 ; X64-NOBMI2-LABEL: clear_highbits32_c0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    shll %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_highbits32_c0:
@@ -375,10 +377,10 @@
 ; X64-NOBMI2-LABEL: clear_highbits32_c1_indexzext:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    shll %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_highbits32_c1_indexzext:
@@ -488,10 +490,10 @@
 ; X64-NOBMI2-LABEL: clear_highbits32_c4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    shll %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_highbits32_c4_commutative:
@@ -545,10 +547,10 @@
 ; X64-NOBMI2-LABEL: clear_highbits64_c0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_highbits64_c0:
@@ -598,10 +600,10 @@
 ; X64-NOBMI2-LABEL: clear_highbits64_c1_indexzext:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_highbits64_c1_indexzext:
@@ -775,10 +777,10 @@
 ; X64-NOBMI2-LABEL: clear_highbits64_c4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_highbits64_c4_commutative:
@@ -834,9 +836,10 @@
 ; X64-NOBMI2-NEXT:    pushq %rbp
 ; X64-NOBMI2-NEXT:    pushq %rbx
 ; X64-NOBMI2-NEXT:    pushq %rax
+; X64-NOBMI2-NEXT:    movl %esi, %ecx
 ; X64-NOBMI2-NEXT:    movl %edi, %ebx
 ; X64-NOBMI2-NEXT:    movl $-1, %ebp
-; X64-NOBMI2-NEXT:    movl %esi, %ecx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI2-NEXT:    movl %ebp, %edi
 ; X64-NOBMI2-NEXT:    callq use32
@@ -934,9 +937,10 @@
 ; X64-NOBMI2-NEXT:    pushq %r14
 ; X64-NOBMI2-NEXT:    pushq %rbx
 ; X64-NOBMI2-NEXT:    pushq %rax
+; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq %rdi, %r14
 ; X64-NOBMI2-NEXT:    movq $-1, %rbx
-; X64-NOBMI2-NEXT:    movl %esi, %ecx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shrq %cl, %rbx
 ; X64-NOBMI2-NEXT:    movq %rbx, %rdi
 ; X64-NOBMI2-NEXT:    callq use64
diff --git a/llvm/test/CodeGen/X86/clear-lowbits.ll b/llvm/test/CodeGen/X86/clear-lowbits.ll
index bd77459..f29717e 100644
--- a/llvm/test/CodeGen/X86/clear-lowbits.ll
+++ b/llvm/test/CodeGen/X86/clear-lowbits.ll
@@ -35,10 +35,11 @@
 ; X64-LABEL: clear_lowbits8_c0:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = shl i8 -1, %numlowbits
   %masked = and i8 %mask, %val
@@ -81,10 +82,11 @@
 ; X64-LABEL: clear_lowbits8_c4_commutative:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = shl i8 -1, %numlowbits
   %masked = and i8 %val, %mask ; swapped order
@@ -327,10 +329,10 @@
 ; X64-NOBMI2-LABEL: clear_lowbits32_c0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_c0:
@@ -362,10 +364,10 @@
 ; X64-NOBMI2-LABEL: clear_lowbits32_c1_indexzext:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_c1_indexzext:
@@ -475,10 +477,10 @@
 ; X64-NOBMI2-LABEL: clear_lowbits32_c4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_c4_commutative:
@@ -530,10 +532,10 @@
 ; X64-NOBMI2-LABEL: clear_lowbits64_c0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_c0:
@@ -583,10 +585,10 @@
 ; X64-NOBMI2-LABEL: clear_lowbits64_c1_indexzext:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_c1_indexzext:
@@ -760,10 +762,10 @@
 ; X64-NOBMI2-LABEL: clear_lowbits64_c4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_c4_commutative:
@@ -794,11 +796,12 @@
 ;
 ; X64-LABEL: clear_lowbits8_ic0:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movb $8, %cl
 ; X64-NEXT:    subb %sil, %cl
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %numhighbits = sub i8 8, %numlowbits
   %mask = shl i8 -1, %numhighbits
@@ -844,11 +847,12 @@
 ;
 ; X64-LABEL: clear_lowbits8_ic4_commutative:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movb $8, %cl
 ; X64-NEXT:    subb %sil, %cl
-; X64-NEXT:    shrb %cl, %dil
-; X64-NEXT:    shlb %cl, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrb %cl, %al
+; X64-NEXT:    shlb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %numhighbits = sub i8 8, %numlowbits
   %mask = shl i8 -1, %numhighbits
@@ -1126,11 +1130,11 @@
 ; X64-NOBMI2-LABEL: clear_lowbits32_ic0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    negl %ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_ic0:
@@ -1167,11 +1171,11 @@
 ; X64-NOBMI2-LABEL: clear_lowbits32_ic1_indexzext:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    negb %cl
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    negb %cl
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_ic1_indexzext:
@@ -1298,11 +1302,11 @@
 ; X64-NOBMI2-LABEL: clear_lowbits32_ic4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    negl %ecx
-; X64-NOBMI2-NEXT:    shrl %cl, %edi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shll %cl, %edi
 ; X64-NOBMI2-NEXT:    movl %edi, %eax
+; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    shrl %cl, %eax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shll %cl, %eax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits32_ic4_commutative:
@@ -1358,11 +1362,11 @@
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic0:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
-; X64-NOBMI2-NEXT:    negl %ecx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_ic0:
@@ -1416,11 +1420,11 @@
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic1_indexzext:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movl %esi, %ecx
-; X64-NOBMI2-NEXT:    negb %cl
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    negb %cl
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_ic1_indexzext:
@@ -1608,11 +1612,11 @@
 ; X64-NOBMI2-LABEL: clear_lowbits64_ic4_commutative:
 ; X64-NOBMI2:       # %bb.0:
 ; X64-NOBMI2-NEXT:    movq %rsi, %rcx
-; X64-NOBMI2-NEXT:    negl %ecx
-; X64-NOBMI2-NEXT:    shrq %cl, %rdi
-; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI2-NEXT:    shlq %cl, %rdi
 ; X64-NOBMI2-NEXT:    movq %rdi, %rax
+; X64-NOBMI2-NEXT:    negl %ecx
+; X64-NOBMI2-NEXT:    shrq %cl, %rax
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI2-NEXT:    shlq %cl, %rax
 ; X64-NOBMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: clear_lowbits64_ic4_commutative:
@@ -1670,9 +1674,10 @@
 ; X64-NOBMI2-NEXT:    pushq %rbp
 ; X64-NOBMI2-NEXT:    pushq %rbx
 ; X64-NOBMI2-NEXT:    pushq %rax
+; X64-NOBMI2-NEXT:    movl %esi, %ecx
 ; X64-NOBMI2-NEXT:    movl %edi, %ebx
 ; X64-NOBMI2-NEXT:    movl $-1, %ebp
-; X64-NOBMI2-NEXT:    movl %esi, %ecx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI2-NEXT:    shll %cl, %ebp
 ; X64-NOBMI2-NEXT:    movl %ebp, %edi
 ; X64-NOBMI2-NEXT:    callq use32
@@ -1770,9 +1775,10 @@
 ; X64-NOBMI2-NEXT:    pushq %r14
 ; X64-NOBMI2-NEXT:    pushq %rbx
 ; X64-NOBMI2-NEXT:    pushq %rax
+; X64-NOBMI2-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI2-NEXT:    movq %rdi, %r14
 ; X64-NOBMI2-NEXT:    movq $-1, %rbx
-; X64-NOBMI2-NEXT:    movl %esi, %ecx
+; X64-NOBMI2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI2-NEXT:    shlq %cl, %rbx
 ; X64-NOBMI2-NEXT:    movq %rbx, %rdi
 ; X64-NOBMI2-NEXT:    callq use64
diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll
index 51c1ac5..9ce99c78 100644
--- a/llvm/test/CodeGen/X86/cmov-into-branch.ll
+++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll
@@ -5,9 +5,9 @@
 define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y)  {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    ucomisd (%rdi), %xmm0
-; CHECK-NEXT:    cmovbel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    ucomisd (%rdi), %xmm0
+; CHECK-NEXT:    cmovbel %edx, %eax
 ; CHECK-NEXT:    retq
   %load = load double, double* %b, align 8
   %cmp = fcmp olt double %load, %a
@@ -19,9 +19,9 @@
 define i32 @test2(double %a, double %b, i32 %x, i32 %y)  {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    ucomisd %xmm1, %xmm0
-; CHECK-NEXT:    cmovbel %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ucomisd %xmm1, %xmm0
+; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
   %cmp = fcmp ogt double %a, %b
   %cond = select i1 %cmp, i32 %x, i32 %y
@@ -48,10 +48,10 @@
 define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl %edi, (%rsi)
-; CHECK-NEXT:    cmoval %edi, %ecx
-; CHECK-NEXT:    cmovael %edx, %ecx
 ; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    cmpl %edi, (%rsi)
+; CHECK-NEXT:    cmoval %edi, %eax
+; CHECK-NEXT:    cmovael %edx, %eax
 ; CHECK-NEXT:    retq
   %load = load i32, i32* %b, align 4
   %cmp = icmp ult i32 %load, %a
@@ -83,9 +83,9 @@
 define i32 @weighted_select1(i32 %a, i32 %b) {
 ; CHECK-LABEL: weighted_select1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    cmovnel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
@@ -96,12 +96,12 @@
 define i32 @weighted_select2(i32 %a, i32 %b) {
 ; CHECK-LABEL: weighted_select2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    jne .LBB6_2
 ; CHECK-NEXT:  # %bb.1: # %select.false
-; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:  .LBB6_2: # %select.end
-; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1
@@ -115,14 +115,13 @@
 define i32 @weighted_select3(i32 %a, i32 %b) {
 ; CHECK-LABEL: weighted_select3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    je .LBB7_1
 ; CHECK-NEXT:  # %bb.2: # %select.end
-; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB7_1: # %select.false
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
@@ -133,9 +132,9 @@
 define i32 @unweighted_select(i32 %a, i32 %b) {
 ; CHECK-LABEL: unweighted_select:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    cmovnel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
   %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3
diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll
index fb921ee..2e92d8e 100644
--- a/llvm/test/CodeGen/X86/cmov.ll
+++ b/llvm/test/CodeGen/X86/cmov.ll
@@ -194,11 +194,14 @@
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    jne .LBB6_2
-; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    movl %edx, %esi
-; CHECK-NEXT:  .LBB6_2:
+; CHECK-NEXT:    jne .LBB6_1
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB6_1:
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %d = select i1 %c, i8 %a, i8 %b
   ret i8 %d
diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll
index 98a7eb7..23fd0fb 100644
--- a/llvm/test/CodeGen/X86/cmovcmov.ll
+++ b/llvm/test/CodeGen/X86/cmovcmov.ll
@@ -9,10 +9,10 @@
 
 ; CHECK-LABEL: test_select_fcmp_oeq_i32:
 
-; CMOV-NEXT: ucomiss  %xmm1, %xmm0
-; CMOV-NEXT: cmovnel  %esi, %edi
-; CMOV-NEXT: cmovpl  %esi, %edi
 ; CMOV-NEXT: movl  %edi, %eax
+; CMOV-NEXT: ucomiss  %xmm1, %xmm0
+; CMOV-NEXT: cmovnel  %esi, %eax
+; CMOV-NEXT: cmovpl  %esi, %eax
 ; CMOV-NEXT: retq
 
 ; NOCMOV-NEXT:  flds  8(%esp)
@@ -36,10 +36,10 @@
 
 ; CHECK-LABEL: test_select_fcmp_oeq_i64:
 
-; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
-; CMOV-NEXT:   cmovneq  %rsi, %rdi
-; CMOV-NEXT:   cmovpq  %rsi, %rdi
 ; CMOV-NEXT:   movq  %rdi, %rax
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   cmovneq  %rsi, %rax
+; CMOV-NEXT:   cmovpq  %rsi, %rax
 ; CMOV-NEXT:   retq
 
 ; NOCMOV-NEXT:   flds  8(%esp)
@@ -64,10 +64,10 @@
 
 ; CHECK-LABEL: test_select_fcmp_une_i64:
 
-; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
-; CMOV-NEXT:   cmovneq  %rdi, %rsi
-; CMOV-NEXT:   cmovpq  %rdi, %rsi
 ; CMOV-NEXT:   movq  %rsi, %rax
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   cmovneq  %rdi, %rax
+; CMOV-NEXT:   cmovpq  %rdi, %rax
 ; CMOV-NEXT:   retq
 
 ; NOCMOV-NEXT:   flds  8(%esp)
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index 71cb960..b6ecda0 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -271,9 +271,9 @@
 define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
 ; CHECK-LABEL: test13:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
-; CHECK-NEXT:    cmovnel %edx, %esi # encoding: [0x0f,0x45,0xf2]
 ; CHECK-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
+; CHECK-NEXT:    testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
+; CHECK-NEXT:    cmovnel %edx, %eax # encoding: [0x0f,0x45,0xc2]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
   %and = and i32 %mask, 8
@@ -286,9 +286,9 @@
 define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
 ; CHECK-LABEL: test14:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shrl $7, %edi # encoding: [0xc1,0xef,0x07]
-; CHECK-NEXT:    cmovnsl %edx, %esi # encoding: [0x0f,0x49,0xf2]
 ; CHECK-NEXT:    movl %esi, %eax # encoding: [0x89,0xf0]
+; CHECK-NEXT:    shrl $7, %edi # encoding: [0xc1,0xef,0x07]
+; CHECK-NEXT:    cmovnsl %edx, %eax # encoding: [0x0f,0x49,0xc2]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 entry:
   %s = lshr i32 %mask, 7
diff --git a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index dc83676..0cc2d42 100644
--- a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -151,6 +151,7 @@
 ;
 ; 64-ALL-LABEL: test_control_flow:
 ; 64-ALL:       # %bb.0: # %entry
+; 64-ALL-NEXT:    movl %esi, %eax
 ; 64-ALL-NEXT:    cmpl %edx, %esi
 ; 64-ALL-NEXT:    jle .LBB1_5
 ; 64-ALL-NEXT:    .p2align 4, 0x90
@@ -171,9 +172,8 @@
 ; 64-ALL-NEXT:    lock cmpxchgl %eax, (%rdi)
 ; 64-ALL-NEXT:    jne .LBB1_1
 ; 64-ALL-NEXT:  # %bb.4:
-; 64-ALL-NEXT:    xorl %esi, %esi
+; 64-ALL-NEXT:    xorl %eax, %eax
 ; 64-ALL-NEXT:  .LBB1_5: # %cond.end
-; 64-ALL-NEXT:    movl %esi, %eax
 ; 64-ALL-NEXT:    retq
 entry:
   %cmp = icmp sgt i32 %i, %j
diff --git a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
index d2dbb30..42f29f3 100644
--- a/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/llvm/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -7,10 +7,9 @@
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rcx, %r9
+; CHECK-NEXT:    movq %rcx, %rbx
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    movq %r8, %rcx
-; CHECK-NEXT:    movq %r9, %rbx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    popq %rbx
@@ -27,10 +26,9 @@
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rcx, %r9
+; CHECK-NEXT:    movq %rcx, %rbx
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    movq %r8, %rcx
-; CHECK-NEXT:    movq %r9, %rbx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
 ; CHECK-NEXT:    jne .LBB1_2
 ; CHECK-NEXT:  # %bb.1: # %true
@@ -64,14 +62,13 @@
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rcx, %r9
-; CHECK-NEXT:    movq %rdx, %r10
+; CHECK-NEXT:    movq %rcx, %rbx
+; CHECK-NEXT:    movq %rdx, %r9
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    movq %r8, %rcx
-; CHECK-NEXT:    movq %r9, %rbx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
 ; CHECK-NEXT:    cmpq %rsi, %rax
-; CHECK-NEXT:    sbbq %r10, %rdx
+; CHECK-NEXT:    sbbq %r9, %rdx
 ; CHECK-NEXT:    setge %al
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
@@ -88,15 +85,14 @@
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %rbx, -16
-; CHECK-NEXT:    movq %rcx, %r9
-; CHECK-NEXT:    xorl %r10d, %r10d
+; CHECK-NEXT:    movq %rcx, %rbx
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    movq %r8, %rcx
-; CHECK-NEXT:    movq %r9, %rbx
 ; CHECK-NEXT:    lock cmpxchg16b (%rdi)
-; CHECK-NEXT:    sete %r10b
+; CHECK-NEXT:    sete %sil
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll
index 9a9f535..7e3b996 100644
--- a/llvm/test/CodeGen/X86/combine-add.ll
+++ b/llvm/test/CodeGen/X86/combine-add.ll
@@ -103,8 +103,8 @@
 define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE-LABEL: combine_vec_add_sub_add0:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psubd %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psubd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_add_sub_add0:
@@ -121,8 +121,8 @@
 define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE-LABEL: combine_vec_add_sub_add1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psubd %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psubd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_add_sub_add1:
@@ -139,8 +139,8 @@
 define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE-LABEL: combine_vec_add_sub_add2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    paddd %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_add_sub_add2:
@@ -157,8 +157,8 @@
 define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE-LABEL: combine_vec_add_sub_add3:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psubd %xmm2, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psubd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_add_sub_add3:
@@ -203,9 +203,9 @@
 ;
 ; AVX-LABEL: combine_vec_add_uniquebits:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680]
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll
index 93fc787..15aec1a 100644
--- a/llvm/test/CodeGen/X86/combine-rotates.ll
+++ b/llvm/test/CodeGen/X86/combine-rotates.ll
@@ -98,8 +98,8 @@
 define i32 @combine_rot_select_zero(i32, i32) {
 ; CHECK-LABEL: combine_rot_select_zero:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    roll %cl, %eax
 ; CHECK-NEXT:    testl %esi, %esi
 ; CHECK-NEXT:    cmovel %edi, %eax
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 7c7a6f8..660bff4 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -29,8 +29,8 @@
 define i32 @combine_sdiv_by_negone(i32 %x) {
 ; CHECK-LABEL: combine_sdiv_by_negone:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    negl %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    retq
   %1 = sdiv i32 %x, -1
   ret i32 %1
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index ae1e7c6..cddf746 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -64,8 +64,8 @@
 define i32 @combine_udiv_by_minsigned(i32 %x) {
 ; CHECK-LABEL: combine_udiv_by_minsigned:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    retq
   %1 = udiv i32 %x, -2147483648
   ret i32 %1
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index b507c06..04c0abe 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -62,8 +62,8 @@
 define i32 @combine_urem_by_minsigned(i32 %x) {
 ; CHECK-LABEL: combine_urem_by_minsigned:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
 ; CHECK-NEXT:    retq
   %1 = urem i32 %x, -2147483648
   ret i32 %1
diff --git a/llvm/test/CodeGen/X86/conditional-indecrement.ll b/llvm/test/CodeGen/X86/conditional-indecrement.ll
index 6a68144..924a456 100644
--- a/llvm/test/CodeGen/X86/conditional-indecrement.ll
+++ b/llvm/test/CodeGen/X86/conditional-indecrement.ll
@@ -4,9 +4,9 @@
 define i32 @test1(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    sbbl $-1, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $-1, %eax
 ; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
@@ -17,9 +17,9 @@
 define i32 @test1_commute(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test1_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    sbbl $-1, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $-1, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
   %inc = zext i1 %cmp to i32
@@ -30,9 +30,9 @@
 define i32 @test2(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    adcl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
@@ -43,9 +43,9 @@
 define i32 @test3(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    adcl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
@@ -56,9 +56,9 @@
 define i32 @test4(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    sbbl $-1, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $-1, %eax
 ; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
@@ -69,9 +69,9 @@
 define i32 @test5(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    adcl $-1, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $-1, %eax
 ; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
@@ -82,9 +82,9 @@
 define i32 @test6(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    sbbl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
@@ -95,9 +95,9 @@
 define i32 @test7(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    sbbl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
@@ -108,9 +108,9 @@
 define i32 @test8(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: test8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl $1, %edi
-; CHECK-NEXT:    adcl $-1, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $-1, %eax
 ; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
index 45d8324..506234b 100644
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -194,10 +194,11 @@
 define i32 @shl_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: shl_constant_sel_constants:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    xorb $3, %dil
-; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    xorb $3, %cl
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 2, i32 3
@@ -208,10 +209,11 @@
 define i32 @lshr_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: lshr_constant_sel_constants:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    xorb $3, %dil
-; CHECK-NEXT:    movl $64, %eax
 ; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    xorb $3, %cl
+; CHECK-NEXT:    movl $64, %eax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 2, i32 3
@@ -222,10 +224,11 @@
 define i32 @ashr_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: ashr_constant_sel_constants:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    xorb $3, %dil
-; CHECK-NEXT:    movl $128, %eax
 ; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    xorb $3, %cl
+; CHECK-NEXT:    movl $128, %eax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrl %cl, %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 2, i32 3
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 00a0e31..9fbef11 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -95,8 +95,8 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl $365384439, %eax # imm = 0x15C752F7
 ; X32-NEXT:    mull {{[0-9]+}}(%esp)
-; X32-NEXT:    shrl $27, %edx
 ; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    shrl $27, %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test5:
@@ -217,9 +217,9 @@
 ;
 ; X64-LABEL: testsize1:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pushq $32
 ; X64-NEXT:    popq %rcx
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %ecx
 ; X64-NEXT:    retq
@@ -240,9 +240,9 @@
 ;
 ; X64-LABEL: testsize2:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pushq $33
 ; X64-NEXT:    popq %rcx
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %ecx
 ; X64-NEXT:    retq
@@ -260,8 +260,8 @@
 ;
 ; X64-LABEL: testsize3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    shrl $5, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shrl $5, %eax
 ; X64-NEXT:    retq
 entry:
 	%div = udiv i32 %x, 32
@@ -280,10 +280,10 @@
 ;
 ; X64-LABEL: testsize4:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pushq $33
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    divl %ecx
 ; X64-NEXT:    retq
 entry:
@@ -311,38 +311,36 @@
 ;
 ; X64-FAST-LABEL: PR23590:
 ; X64-FAST:       # %bb.0: # %entry
-; X64-FAST-NEXT:    movq %rdi, %rcx
-; X64-FAST-NEXT:    movabsq $6120523590596543007, %rdx # imm = 0x54F077C718E7C21F
+; X64-FAST-NEXT:    movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F
 ; X64-FAST-NEXT:    movq %rdi, %rax
-; X64-FAST-NEXT:    mulq %rdx
+; X64-FAST-NEXT:    mulq %rcx
 ; X64-FAST-NEXT:    shrq $12, %rdx
 ; X64-FAST-NEXT:    imulq $12345, %rdx, %rax # imm = 0x3039
-; X64-FAST-NEXT:    subq %rax, %rcx
-; X64-FAST-NEXT:    movabsq $2635249153387078803, %rdx # imm = 0x2492492492492493
-; X64-FAST-NEXT:    movq %rcx, %rax
-; X64-FAST-NEXT:    mulq %rdx
-; X64-FAST-NEXT:    subq %rdx, %rcx
-; X64-FAST-NEXT:    shrq %rcx
-; X64-FAST-NEXT:    leaq (%rcx,%rdx), %rax
+; X64-FAST-NEXT:    subq %rax, %rdi
+; X64-FAST-NEXT:    movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; X64-FAST-NEXT:    movq %rdi, %rax
+; X64-FAST-NEXT:    mulq %rcx
+; X64-FAST-NEXT:    subq %rdx, %rdi
+; X64-FAST-NEXT:    shrq %rdi
+; X64-FAST-NEXT:    leaq (%rdi,%rdx), %rax
 ; X64-FAST-NEXT:    shrq $2, %rax
 ; X64-FAST-NEXT:    retq
 ;
 ; X64-SLOW-LABEL: PR23590:
 ; X64-SLOW:       # %bb.0: # %entry
-; X64-SLOW-NEXT:    movq %rdi, %rcx
-; X64-SLOW-NEXT:    movabsq $6120523590596543007, %rdx # imm = 0x54F077C718E7C21F
+; X64-SLOW-NEXT:    movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F
 ; X64-SLOW-NEXT:    movq %rdi, %rax
-; X64-SLOW-NEXT:    mulq %rdx
+; X64-SLOW-NEXT:    mulq %rcx
 ; X64-SLOW-NEXT:    shrq $12, %rdx
 ; X64-SLOW-NEXT:    imulq $12345, %rdx, %rax # imm = 0x3039
-; X64-SLOW-NEXT:    subq %rax, %rcx
-; X64-SLOW-NEXT:    imulq $613566757, %rcx, %rax # imm = 0x24924925
+; X64-SLOW-NEXT:    subq %rax, %rdi
+; X64-SLOW-NEXT:    imulq $613566757, %rdi, %rax # imm = 0x24924925
 ; X64-SLOW-NEXT:    shrq $32, %rax
-; X64-SLOW-NEXT:    subl %eax, %ecx
-; X64-SLOW-NEXT:    shrl %ecx
-; X64-SLOW-NEXT:    addl %eax, %ecx
-; X64-SLOW-NEXT:    shrl $2, %ecx
-; X64-SLOW-NEXT:    movq %rcx, %rax
+; X64-SLOW-NEXT:    subl %eax, %edi
+; X64-SLOW-NEXT:    shrl %edi
+; X64-SLOW-NEXT:    addl %eax, %edi
+; X64-SLOW-NEXT:    shrl $2, %edi
+; X64-SLOW-NEXT:    movq %rdi, %rax
 ; X64-SLOW-NEXT:    retq
 entry:
 	%rem = urem i64 %x, 12345
@@ -390,10 +388,10 @@
 ; X64-NEXT:    shrq $11, %rax
 ; X64-NEXT:    movabsq $4835703278458517, %rcx # imm = 0x112E0BE826D695
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    shrq $9, %rdx
-; X64-NEXT:    imull $-294967296, %edx, %eax # imm = 0xEE6B2800
-; X64-NEXT:    subl %eax, %edi
 ; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    shrq $9, %rax
+; X64-NEXT:    imull $-294967296, %eax, %ecx # imm = 0xEE6B2800
+; X64-NEXT:    subl %ecx, %edi
 ; X64-NEXT:    movl %edi, %edx
 ; X64-NEXT:    retq
   %2 = udiv i64 %0, 4000000000
diff --git a/llvm/test/CodeGen/X86/divrem.ll b/llvm/test/CodeGen/X86/divrem.ll
index 67acba0..df312b5 100644
--- a/llvm/test/CodeGen/X86/divrem.ll
+++ b/llvm/test/CodeGen/X86/divrem.ll
@@ -101,6 +101,7 @@
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    cwtd
 ; X64-NEXT:    idivw %si
 ; X64-NEXT:    movw %ax, (%r8)
@@ -131,6 +132,7 @@
 ; X64-LABEL: si8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    cbtw
 ; X64-NEXT:    idivb %sil
 ; X64-NEXT:    movsbl %ah, %esi
@@ -182,8 +184,8 @@
 ; X64-LABEL: ui64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divq %rsi
 ; X64-NEXT:    movq %rax, (%r8)
 ; X64-NEXT:    movq %rdx, (%rcx)
@@ -212,8 +214,8 @@
 ; X64-LABEL: ui32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divl %esi
 ; X64-NEXT:    movl %eax, (%r8)
 ; X64-NEXT:    movl %edx, (%rcx)
@@ -242,8 +244,9 @@
 ; X64-LABEL: ui16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    divw %si
 ; X64-NEXT:    movw %ax, (%r8)
 ; X64-NEXT:    movw %dx, (%rcx)
diff --git a/llvm/test/CodeGen/X86/divrem8_ext.ll b/llvm/test/CodeGen/X86/divrem8_ext.ll
index 313aa86..b9b6e1e 100644
--- a/llvm/test/CodeGen/X86/divrem8_ext.ll
+++ b/llvm/test/CodeGen/X86/divrem8_ext.ll
@@ -112,6 +112,7 @@
 ; X64-LABEL: test_sdivrem_sext_ah:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    cbtw
 ; X64-NEXT:    idivb %sil
 ; X64-NEXT:    movsbl %ah, %ecx
@@ -137,6 +138,7 @@
 ; X64-LABEL: test_srem_sext_ah:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    cbtw
 ; X64-NEXT:    idivb %sil
 ; X64-NEXT:    movsbl %ah, %eax
@@ -161,6 +163,7 @@
 ; X64-LABEL: test_srem_noext_ah:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    cbtw
 ; X64-NEXT:    idivb %sil
 ; X64-NEXT:    movsbl %ah, %eax
@@ -186,6 +189,7 @@
 ; X64-LABEL: test_srem_sext64_ah:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    cbtw
 ; X64-NEXT:    idivb %sil
 ; X64-NEXT:    movsbl %ah, %eax
diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll
index 6e1fa02..ee71543 100644
--- a/llvm/test/CodeGen/X86/extract-lowbits.ll
+++ b/llvm/test/CodeGen/X86/extract-lowbits.ll
@@ -45,8 +45,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_a0:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    decl %eax
 ; X64-NOBMI-NEXT:    andl %edi, %eax
@@ -80,8 +81,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_a1_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    decl %eax
 ; X64-NOBMI-NEXT:    andl %edi, %eax
@@ -118,8 +120,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_a2_load:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    decl %eax
 ; X64-NOBMI-NEXT:    andl (%rdi), %eax
@@ -156,8 +159,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_a3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    decl %eax
 ; X64-NOBMI-NEXT:    andl (%rdi), %eax
@@ -193,8 +197,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_a4_commutative:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    decl %eax
 ; X64-NOBMI-NEXT:    andl %edi, %eax
@@ -253,8 +258,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_a0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movl $1, %eax
-; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    decq %rax
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
@@ -311,8 +317,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_a1_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    decq %rax
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
@@ -377,8 +384,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_a2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movl $1, %eax
-; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    decq %rax
 ; X64-NOBMI-NEXT:    andq (%rdi), %rax
@@ -442,8 +450,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_a3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    decq %rax
 ; X64-NOBMI-NEXT:    andq (%rdi), %rax
@@ -503,8 +512,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_a4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movl $1, %eax
-; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    decq %rax
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
@@ -542,8 +552,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_b0:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $-1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    notl %eax
 ; X64-NOBMI-NEXT:    andl %edi, %eax
@@ -577,8 +588,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_b1_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $-1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    notl %eax
 ; X64-NOBMI-NEXT:    andl %edi, %eax
@@ -615,8 +627,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_b2_load:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $-1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    notl %eax
 ; X64-NOBMI-NEXT:    andl (%rdi), %eax
@@ -653,8 +666,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_b3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $-1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    notl %eax
 ; X64-NOBMI-NEXT:    andl (%rdi), %eax
@@ -690,8 +704,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi32_b4_commutative:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movl $-1, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shll %cl, %eax
 ; X64-NOBMI-NEXT:    notl %eax
 ; X64-NOBMI-NEXT:    andl %edi, %eax
@@ -749,8 +764,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_b0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq $-1, %rax
-; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    notq %rax
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
@@ -806,8 +822,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_b1_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movq $-1, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    notq %rax
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
@@ -869,8 +886,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_b2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq $-1, %rax
-; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    notq %rax
 ; X64-NOBMI-NEXT:    andq (%rdi), %rax
@@ -931,8 +949,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_b3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    movq $-1, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    notq %rax
 ; X64-NOBMI-NEXT:    andq (%rdi), %rax
@@ -991,8 +1010,9 @@
 ;
 ; X64-NOBMI-LABEL: bzhi64_b4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
 ; X64-NOBMI-NEXT:    movq $-1, %rax
-; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
 ; X64-NOBMI-NEXT:    notq %rax
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
@@ -1032,11 +1052,11 @@
 ; X64-NOBMI-LABEL: bzhi32_c0:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %edi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %edi
 ; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrl %cl, %eax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c0:
@@ -1069,11 +1089,11 @@
 ; X64-NOBMI-LABEL: bzhi32_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shll %cl, %edi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %edi
 ; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    negb %cl
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrl %cl, %eax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
@@ -1188,11 +1208,11 @@
 ; X64-NOBMI-LABEL: bzhi32_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %edi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %edi
 ; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrl %cl, %eax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c4_commutative:
@@ -1246,11 +1266,11 @@
 ; X64-NOBMI-LABEL: bzhi64_c0:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rdi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI-NEXT:    shrq %cl, %rax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c0:
@@ -1302,11 +1322,11 @@
 ; X64-NOBMI-LABEL: bzhi64_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shlq %cl, %rdi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    negb %cl
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrq %cl, %rax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
@@ -1488,11 +1508,11 @@
 ; X64-NOBMI-LABEL: bzhi64_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rdi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI-NEXT:    shrq %cl, %rax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c4_commutative:
@@ -1529,11 +1549,11 @@
 ; X64-NOBMI-LABEL: bzhi32_d0:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %edi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %edi
 ; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrl %cl, %eax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d0:
@@ -1566,11 +1586,11 @@
 ; X64-NOBMI-LABEL: bzhi32_d1_indexzext:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shll %cl, %edi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %edi
 ; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    negb %cl
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrl %cl, %eax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -1742,11 +1762,11 @@
 ; X64-NOBMI-LABEL: bzhi64_d0:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rdi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    negl %ecx
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI-NEXT:    shrq %cl, %rax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d0:
@@ -1834,11 +1854,11 @@
 ; X64-NOBMI-LABEL: bzhi64_d1_indexzext:
 ; X64-NOBMI:       # %bb.0:
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shlq %cl, %rdi
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rdi
 ; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    negb %cl
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrq %cl, %rax
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d1_indexzext:
@@ -2060,8 +2080,8 @@
 ;
 ; X64-LABEL: bzhi32_constant_mask32:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X64-NEXT:    retq
   %masked = and i32 %val, 2147483647
   ret i32 %masked
@@ -2094,8 +2114,8 @@
 ;
 ; X64-LABEL: bzhi32_constant_mask16:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $32767, %edi # imm = 0x7FFF
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $32767, %eax # imm = 0x7FFF
 ; X64-NEXT:    retq
   %masked = and i32 %val, 32767
   ret i32 %masked
@@ -2128,8 +2148,8 @@
 ;
 ; X64-LABEL: bzhi32_constant_mask8:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $127, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $127, %eax
 ; X64-NEXT:    retq
   %masked = and i32 %val, 127
   ret i32 %masked
@@ -2223,8 +2243,8 @@
 ;
 ; X64-LABEL: bzhi64_constant_mask32:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $2147483647, %edi # imm = 0x7FFFFFFF
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
 ; X64-NEXT:    retq
   %masked = and i64 %val, 2147483647
   ret i64 %masked
@@ -2259,8 +2279,8 @@
 ;
 ; X64-LABEL: bzhi64_constant_mask16:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $32767, %edi # imm = 0x7FFF
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andl $32767, %eax # imm = 0x7FFF
 ; X64-NEXT:    retq
   %masked = and i64 %val, 32767
   ret i64 %masked
@@ -2295,8 +2315,8 @@
 ;
 ; X64-LABEL: bzhi64_constant_mask8:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $127, %edi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andl $127, %eax
 ; X64-NEXT:    retq
   %masked = and i64 %val, 127
   ret i64 %masked
diff --git a/llvm/test/CodeGen/X86/fast-isel-fold-mem.ll b/llvm/test/CodeGen/X86/fast-isel-fold-mem.ll
index 1c51719..ebd8fb7 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fold-mem.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fold-mem.ll
@@ -5,8 +5,8 @@
 define i64 @fold_load(i64* %a, i64 %b) {
 ; CHECK-LABEL: fold_load:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    addq (%rdi), %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    addq (%rdi), %rax
 ; CHECK-NEXT:    retq
   %1 = load i64, i64* %a, align 8
   %2 = add i64 %1, %b
diff --git a/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll b/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll
index 3e9b99f..f1cdb85 100644
--- a/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-select-cmov.ll
@@ -31,9 +31,9 @@
 define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
 ; CHECK-LABEL: select_cmov_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmovel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovel %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = select i1 %cond, i32 %a, i32 %b
   ret i32 %1
@@ -42,9 +42,9 @@
 define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: select_cmp_cmov_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpl %esi, %edi
-; CHECK-NEXT:    cmovbl %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    cmovbl %edi, %eax
 ; CHECK-NEXT:    retq
   %1 = icmp ult i32 %a, %b
   %2 = select i1 %1, i32 %a, i32 %b
@@ -54,9 +54,9 @@
 define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
 ; CHECK-LABEL: select_cmov_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    cmoveq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmoveq %rdx, %rax
 ; CHECK-NEXT:    retq
   %1 = select i1 %cond, i64 %a, i64 %b
   ret i64 %1
@@ -65,9 +65,9 @@
 define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) {
 ; CHECK-LABEL: select_cmp_cmov_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovbq %rdi, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovbq %rdi, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp ult i64 %a, %b
   %2 = select i1 %1, i64 %a, i64 %b
diff --git a/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll b/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll
index 3dd4d2b..eb2bd08 100644
--- a/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-select-cmov2.ll
@@ -19,30 +19,30 @@
 define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; SDAG-LABEL: select_fcmp_oeq_cmov:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    ucomisd %xmm1, %xmm0
-; SDAG-NEXT:    cmovneq %rsi, %rdi
-; SDAG-NEXT:    cmovpq %rsi, %rdi
 ; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    ucomisd %xmm1, %xmm0
+; SDAG-NEXT:    cmovneq %rsi, %rax
+; SDAG-NEXT:    cmovpq %rsi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: select_fcmp_oeq_cmov:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    ucomisd %xmm1, %xmm0
-; FAST-NEXT:    setnp %al
-; FAST-NEXT:    sete %cl
-; FAST-NEXT:    testb %al, %cl
-; FAST-NEXT:    cmoveq %rsi, %rdi
 ; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    ucomisd %xmm1, %xmm0
+; FAST-NEXT:    setnp %cl
+; FAST-NEXT:    sete %dl
+; FAST-NEXT:    testb %cl, %dl
+; FAST-NEXT:    cmoveq %rsi, %rax
 ; FAST-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_oeq_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    setnp %al
-; FAST_AVX-NEXT:    sete %cl
-; FAST_AVX-NEXT:    testb %al, %cl
-; FAST_AVX-NEXT:    cmoveq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    setnp %cl
+; FAST_AVX-NEXT:    sete %dl
+; FAST_AVX-NEXT:    testb %cl, %dl
+; FAST_AVX-NEXT:    cmoveq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp oeq double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -52,16 +52,16 @@
 define i64 @select_fcmp_ogt_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ogt_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovbeq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovbeq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ogt_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovbeq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovbeq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ogt double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -71,16 +71,16 @@
 define i64 @select_fcmp_oge_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_oge_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovbq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovbq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_oge_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovbq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovbq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp oge double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -90,16 +90,16 @@
 define i64 @select_fcmp_olt_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_olt_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
-; NOAVX-NEXT:    cmovbeq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
+; NOAVX-NEXT:    cmovbeq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_olt_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
-; FAST_AVX-NEXT:    cmovbeq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT:    cmovbeq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp olt double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -109,16 +109,16 @@
 define i64 @select_fcmp_ole_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ole_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
-; NOAVX-NEXT:    cmovbq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
+; NOAVX-NEXT:    cmovbq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ole_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
-; FAST_AVX-NEXT:    cmovbq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT:    cmovbq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ole double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -128,16 +128,16 @@
 define i64 @select_fcmp_one_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_one_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmoveq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmoveq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_one_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmoveq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmoveq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp one double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -147,16 +147,16 @@
 define i64 @select_fcmp_ord_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ord_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovpq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovpq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ord_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovpq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovpq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ord double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -166,16 +166,16 @@
 define i64 @select_fcmp_uno_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_uno_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovnpq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovnpq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_uno_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovnpq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovnpq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp uno double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -185,16 +185,16 @@
 define i64 @select_fcmp_ueq_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ueq_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovneq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovneq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ueq_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovneq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovneq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ueq double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -204,16 +204,16 @@
 define i64 @select_fcmp_ugt_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ugt_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
-; NOAVX-NEXT:    cmovaeq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
+; NOAVX-NEXT:    cmovaeq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ugt_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
-; FAST_AVX-NEXT:    cmovaeq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT:    cmovaeq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ugt double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -223,16 +223,16 @@
 define i64 @select_fcmp_uge_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_uge_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
-; NOAVX-NEXT:    cmovaq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm0, %xmm1
+; NOAVX-NEXT:    cmovaq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_uge_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
-; FAST_AVX-NEXT:    cmovaq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT:    cmovaq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp uge double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -242,16 +242,16 @@
 define i64 @select_fcmp_ult_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ult_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovaeq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovaeq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ult_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovaeq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovaeq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ult double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -261,16 +261,16 @@
 define i64 @select_fcmp_ule_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; NOAVX-LABEL: select_fcmp_ule_cmov:
 ; NOAVX:       ## %bb.0:
-; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
-; NOAVX-NEXT:    cmovaq %rsi, %rdi
 ; NOAVX-NEXT:    movq %rdi, %rax
+; NOAVX-NEXT:    ucomisd %xmm1, %xmm0
+; NOAVX-NEXT:    cmovaq %rsi, %rax
 ; NOAVX-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_ule_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    cmovaq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    cmovaq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp ule double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -280,30 +280,30 @@
 define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) {
 ; SDAG-LABEL: select_fcmp_une_cmov:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    ucomisd %xmm1, %xmm0
-; SDAG-NEXT:    cmovneq %rdi, %rsi
-; SDAG-NEXT:    cmovpq %rdi, %rsi
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    ucomisd %xmm1, %xmm0
+; SDAG-NEXT:    cmovneq %rdi, %rax
+; SDAG-NEXT:    cmovpq %rdi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: select_fcmp_une_cmov:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    ucomisd %xmm1, %xmm0
-; FAST-NEXT:    setp %al
-; FAST-NEXT:    setne %cl
-; FAST-NEXT:    orb %al, %cl
-; FAST-NEXT:    cmoveq %rsi, %rdi
 ; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    ucomisd %xmm1, %xmm0
+; FAST-NEXT:    setp %cl
+; FAST-NEXT:    setne %dl
+; FAST-NEXT:    orb %cl, %dl
+; FAST-NEXT:    cmoveq %rsi, %rax
 ; FAST-NEXT:    retq
 ;
 ; FAST_AVX-LABEL: select_fcmp_une_cmov:
 ; FAST_AVX:       ## %bb.0:
-; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
-; FAST_AVX-NEXT:    setp %al
-; FAST_AVX-NEXT:    setne %cl
-; FAST_AVX-NEXT:    orb %al, %cl
-; FAST_AVX-NEXT:    cmoveq %rsi, %rdi
 ; FAST_AVX-NEXT:    movq %rdi, %rax
+; FAST_AVX-NEXT:    vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT:    setp %cl
+; FAST_AVX-NEXT:    setne %dl
+; FAST_AVX-NEXT:    orb %cl, %dl
+; FAST_AVX-NEXT:    cmoveq %rsi, %rax
 ; FAST_AVX-NEXT:    retq
   %1 = fcmp une double %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -323,9 +323,9 @@
 define i64 @select_icmp_eq_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_eq_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovneq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovneq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp eq i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -335,9 +335,9 @@
 define i64 @select_icmp_ne_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_ne_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmoveq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmoveq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp ne i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -347,9 +347,9 @@
 define i64 @select_icmp_ugt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_ugt_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovbeq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovbeq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp ugt i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -360,9 +360,9 @@
 define i64 @select_icmp_uge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_uge_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovbq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovbq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp uge i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -372,9 +372,9 @@
 define i64 @select_icmp_ult_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_ult_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovaeq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovaeq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp ult i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -384,9 +384,9 @@
 define i64 @select_icmp_ule_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_ule_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovaq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovaq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp ule i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -396,9 +396,9 @@
 define i64 @select_icmp_sgt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_sgt_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovleq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovleq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp sgt i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -408,9 +408,9 @@
 define i64 @select_icmp_sge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_sge_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovlq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovlq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp sge i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -420,9 +420,9 @@
 define i64 @select_icmp_slt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_slt_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovgeq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovgeq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp slt i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
@@ -432,9 +432,9 @@
 define i64 @select_icmp_sle_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
 ; CHECK-LABEL: select_icmp_sle_cmov:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    cmovgq %rcx, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    cmovgq %rcx, %rax
 ; CHECK-NEXT:    retq
   %1 = icmp sle i64 %a, %b
   %2 = select i1 %1, i64 %c, i64 %d
diff --git a/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll b/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
index 3ab0407..58b378a 100644
--- a/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
@@ -281,11 +281,14 @@
 ; CHECK-LABEL: select_icmp_sle_i8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    jle LBB12_2
-; CHECK-NEXT:  ## %bb.1:
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:  LBB12_2:
+; CHECK-NEXT:    jle LBB12_1
+; CHECK-NEXT:  ## %bb.2:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  LBB12_1:
 ; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %1 = icmp sle i64 %a, %b
   %2 = select i1 %1, i8 %c, i8 %d
diff --git a/llvm/test/CodeGen/X86/fast-isel-sext-zext.ll b/llvm/test/CodeGen/X86/fast-isel-sext-zext.ll
index 82ed6c7..9a83db5 100644
--- a/llvm/test/CodeGen/X86/fast-isel-sext-zext.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-sext-zext.ll
@@ -12,9 +12,10 @@
 ;
 ; X64-LABEL: test1:
 ; X64:       ## %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    negb %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %z = trunc i8 %x to i1
   %u = sext i1 %z to i8
@@ -92,8 +93,9 @@
 ;
 ; X64-LABEL: test5:
 ; X64:       ## %bb.0:
-; X64-NEXT:    andb $1, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %z = trunc i8 %x to i1
   %u = zext i1 %z to i8
diff --git a/llvm/test/CodeGen/X86/fast-isel-shift.ll b/llvm/test/CodeGen/X86/fast-isel-shift.ll
index 4dc56f3..2aff7cf 100644
--- a/llvm/test/CodeGen/X86/fast-isel-shift.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-shift.ll
@@ -5,8 +5,10 @@
 ; CHECK-LABEL: shl_i8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shlb %cl, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = shl i8 %a, %b
   ret i8 %c
@@ -16,9 +18,11 @@
 ; CHECK-LABEL: shl_i16:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    ## kill: def $cl killed $cx
-; CHECK-NEXT:    shlw %cl, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cx killed $cx killed $ecx
+; CHECK-NEXT:    ## kill: def $cl killed $cx
+; CHECK-NEXT:    shlw %cl, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = shl i16 %a, %b
   ret i16 %c
@@ -28,9 +32,9 @@
 ; CHECK-LABEL: shl_i32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    ## kill: def $cl killed $ecx
-; CHECK-NEXT:    shll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cl killed $ecx
+; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    retq
   %c = shl i32 %a, %b
   ret i32 %c
@@ -40,9 +44,9 @@
 ; CHECK-LABEL: shl_i64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    ## kill: def $cl killed $rcx
-; CHECK-NEXT:    shlq %cl, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## kill: def $cl killed $rcx
+; CHECK-NEXT:    shlq %cl, %rax
 ; CHECK-NEXT:    retq
   %c = shl i64 %a, %b
   ret i64 %c
@@ -52,8 +56,10 @@
 ; CHECK-LABEL: lshr_i8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shrb %cl, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = lshr i8 %a, %b
   ret i8 %c
@@ -63,9 +69,11 @@
 ; CHECK-LABEL: lshr_i16:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    ## kill: def $cl killed $cx
-; CHECK-NEXT:    shrw %cl, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cx killed $cx killed $ecx
+; CHECK-NEXT:    ## kill: def $cl killed $cx
+; CHECK-NEXT:    shrw %cl, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = lshr i16 %a, %b
   ret i16 %c
@@ -75,9 +83,9 @@
 ; CHECK-LABEL: lshr_i32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    ## kill: def $cl killed $ecx
-; CHECK-NEXT:    shrl %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cl killed $ecx
+; CHECK-NEXT:    shrl %cl, %eax
 ; CHECK-NEXT:    retq
   %c = lshr i32 %a, %b
   ret i32 %c
@@ -87,9 +95,9 @@
 ; CHECK-LABEL: lshr_i64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    ## kill: def $cl killed $rcx
-; CHECK-NEXT:    shrq %cl, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## kill: def $cl killed $rcx
+; CHECK-NEXT:    shrq %cl, %rax
 ; CHECK-NEXT:    retq
   %c = lshr i64 %a, %b
   ret i64 %c
@@ -99,8 +107,10 @@
 ; CHECK-LABEL: ashr_i8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    sarb %cl, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    sarb %cl, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = ashr i8 %a, %b
   ret i8 %c
@@ -110,9 +120,11 @@
 ; CHECK-LABEL: ashr_i16:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    ## kill: def $cl killed $cx
-; CHECK-NEXT:    sarw %cl, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cx killed $cx killed $ecx
+; CHECK-NEXT:    ## kill: def $cl killed $cx
+; CHECK-NEXT:    sarw %cl, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = ashr i16 %a, %b
   ret i16 %c
@@ -122,9 +134,9 @@
 ; CHECK-LABEL: ashr_i32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    ## kill: def $cl killed $ecx
-; CHECK-NEXT:    sarl %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## kill: def $cl killed $ecx
+; CHECK-NEXT:    sarl %cl, %eax
 ; CHECK-NEXT:    retq
   %c = ashr i32 %a, %b
   ret i32 %c
@@ -134,9 +146,9 @@
 ; CHECK-LABEL: ashr_i64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rcx
-; CHECK-NEXT:    ## kill: def $cl killed $rcx
-; CHECK-NEXT:    sarq %cl, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## kill: def $cl killed $rcx
+; CHECK-NEXT:    sarq %cl, %rax
 ; CHECK-NEXT:    retq
   %c = ashr i64 %a, %b
   ret i64 %c
@@ -145,8 +157,9 @@
 define i8 @shl_imm1_i8(i8 %a) {
 ; CHECK-LABEL: shl_imm1_i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shlb $1, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $1, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = shl i8 %a, 1
   ret i8 %c
@@ -185,8 +198,9 @@
 define i8 @lshr_imm1_i8(i8 %a) {
 ; CHECK-LABEL: lshr_imm1_i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrb $1, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrb $1, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = lshr i8 %a, 1
   ret i8 %c
@@ -195,8 +209,9 @@
 define i16 @lshr_imm1_i16(i16 %a) {
 ; CHECK-LABEL: lshr_imm1_i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrw $1, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrw $1, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = lshr i16 %a, 1
   ret i16 %c
@@ -205,8 +220,8 @@
 define i32 @lshr_imm1_i32(i32 %a) {
 ; CHECK-LABEL: lshr_imm1_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $1, %eax
 ; CHECK-NEXT:    retq
   %c = lshr i32 %a, 1
   ret i32 %c
@@ -215,8 +230,8 @@
 define i64 @lshr_imm1_i64(i64 %a) {
 ; CHECK-LABEL: lshr_imm1_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrq $1, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shrq $1, %rax
 ; CHECK-NEXT:    retq
   %c = lshr i64 %a, 1
   ret i64 %c
@@ -225,8 +240,9 @@
 define i8 @ashr_imm1_i8(i8 %a) {
 ; CHECK-LABEL: ashr_imm1_i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarb $1, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarb $1, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = ashr i8 %a, 1
   ret i8 %c
@@ -235,8 +251,9 @@
 define i16 @ashr_imm1_i16(i16 %a) {
 ; CHECK-LABEL: ashr_imm1_i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarw $1, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarw $1, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = ashr i16 %a, 1
   ret i16 %c
@@ -245,8 +262,8 @@
 define i32 @ashr_imm1_i32(i32 %a) {
 ; CHECK-LABEL: ashr_imm1_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarl $1, %eax
 ; CHECK-NEXT:    retq
   %c = ashr i32 %a, 1
   ret i32 %c
@@ -255,8 +272,8 @@
 define i64 @ashr_imm1_i64(i64 %a) {
 ; CHECK-LABEL: ashr_imm1_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarq $1, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    sarq $1, %rax
 ; CHECK-NEXT:    retq
   %c = ashr i64 %a, 1
   ret i64 %c
@@ -265,8 +282,9 @@
 define i8 @shl_imm4_i8(i8 %a) {
 ; CHECK-LABEL: shl_imm4_i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shlb $4, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = shl i8 %a, 4
   ret i8 %c
@@ -275,8 +293,9 @@
 define i16 @shl_imm4_i16(i16 %a) {
 ; CHECK-LABEL: shl_imm4_i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shlw $4, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlw $4, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = shl i16 %a, 4
   ret i16 %c
@@ -285,8 +304,8 @@
 define i32 @shl_imm4_i32(i32 %a) {
 ; CHECK-LABEL: shl_imm4_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shll $4, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $4, %eax
 ; CHECK-NEXT:    retq
   %c = shl i32 %a, 4
   ret i32 %c
@@ -295,8 +314,8 @@
 define i64 @shl_imm4_i64(i64 %a) {
 ; CHECK-LABEL: shl_imm4_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shlq $4, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shlq $4, %rax
 ; CHECK-NEXT:    retq
   %c = shl i64 %a, 4
   ret i64 %c
@@ -305,8 +324,9 @@
 define i8 @lshr_imm4_i8(i8 %a) {
 ; CHECK-LABEL: lshr_imm4_i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrb $4, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrb $4, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = lshr i8 %a, 4
   ret i8 %c
@@ -315,8 +335,9 @@
 define i16 @lshr_imm4_i16(i16 %a) {
 ; CHECK-LABEL: lshr_imm4_i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrw $4, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrw $4, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = lshr i16 %a, 4
   ret i16 %c
@@ -325,8 +346,8 @@
 define i32 @lshr_imm4_i32(i32 %a) {
 ; CHECK-LABEL: lshr_imm4_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrl $4, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $4, %eax
 ; CHECK-NEXT:    retq
   %c = lshr i32 %a, 4
   ret i32 %c
@@ -335,8 +356,8 @@
 define i64 @lshr_imm4_i64(i64 %a) {
 ; CHECK-LABEL: lshr_imm4_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    shrq $4, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shrq $4, %rax
 ; CHECK-NEXT:    retq
   %c = lshr i64 %a, 4
   ret i64 %c
@@ -345,8 +366,9 @@
 define i8 @ashr_imm4_i8(i8 %a) {
 ; CHECK-LABEL: ashr_imm4_i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarb $4, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarb $4, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %c = ashr i8 %a, 4
   ret i8 %c
@@ -355,8 +377,9 @@
 define i16 @ashr_imm4_i16(i16 %a) {
 ; CHECK-LABEL: ashr_imm4_i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarw $4, %di
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarw $4, %ax
+; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %c = ashr i16 %a, 4
   ret i16 %c
@@ -365,8 +388,8 @@
 define i32 @ashr_imm4_i32(i32 %a) {
 ; CHECK-LABEL: ashr_imm4_i32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarl $4, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarl $4, %eax
 ; CHECK-NEXT:    retq
   %c = ashr i32 %a, 4
   ret i32 %c
@@ -375,8 +398,8 @@
 define i64 @ashr_imm4_i64(i64 %a) {
 ; CHECK-LABEL: ashr_imm4_i64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    sarq $4, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    sarq $4, %rax
 ; CHECK-NEXT:    retq
   %c = ashr i64 %a, 4
   ret i64 %c
@@ -386,9 +409,10 @@
 define i8 @PR36731(i8 %a) {
 ; CHECK-LABEL: PR36731:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movb $255, %cl
-; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movb $255, %cl
+; CHECK-NEXT:    shlb %cl, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %b = shl i8 %a, -1
   ret i8 %b
diff --git a/llvm/test/CodeGen/X86/fast-isel-store.ll b/llvm/test/CodeGen/X86/fast-isel-store.ll
index bf52b6c..1b91fcb 100644
--- a/llvm/test/CodeGen/X86/fast-isel-store.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-store.ll
@@ -11,8 +11,8 @@
 define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
 ; ALL32-LABEL: test_store_32:
 ; ALL32:       # %bb.0: # %entry
-; ALL32-NEXT:    movl %esi, (%rdi)
 ; ALL32-NEXT:    movl %esi, %eax
+; ALL32-NEXT:    movl %esi, (%rdi)
 ; ALL32-NEXT:    retq
 ;
 ; ALL64-LABEL: test_store_32:
@@ -29,8 +29,9 @@
 define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
 ; ALL32-LABEL: test_store_16:
 ; ALL32:       # %bb.0: # %entry
-; ALL32-NEXT:    movw %si, (%rdi)
 ; ALL32-NEXT:    movl %esi, %eax
+; ALL32-NEXT:    movw %ax, (%rdi)
+; ALL32-NEXT:    # kill: def $ax killed $ax killed $eax
 ; ALL32-NEXT:    retq
 ;
 ; ALL64-LABEL: test_store_16:
diff --git a/llvm/test/CodeGen/X86/fixup-bw-copy.ll b/llvm/test/CodeGen/X86/fixup-bw-copy.ll
index fe9b886..46cb1b1 100644
--- a/llvm/test/CodeGen/X86/fixup-bw-copy.ll
+++ b/llvm/test/CodeGen/X86/fixup-bw-copy.ll
@@ -7,15 +7,11 @@
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
 
 define i8 @test_movb(i8 %a0) {
-; BWON64-LABEL: test_movb:
-; BWON64:       # %bb.0:
-; BWON64-NEXT:    movl %edi, %eax
-; BWON64-NEXT:    retq
-;
-; BWOFF64-LABEL: test_movb:
-; BWOFF64:       # %bb.0:
-; BWOFF64-NEXT:    movb %dil, %al
-; BWOFF64-NEXT:    retq
+; X64-LABEL: test_movb:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_movb:
 ; X32:       # %bb.0:
@@ -25,15 +21,11 @@
 }
 
 define i16 @test_movw(i16 %a0) {
-; BWON64-LABEL: test_movw:
-; BWON64:       # %bb.0:
-; BWON64-NEXT:    movl %edi, %eax
-; BWON64-NEXT:    retq
-;
-; BWOFF64-LABEL: test_movw:
-; BWOFF64:       # %bb.0:
-; BWOFF64-NEXT:    movw %di, %ax
-; BWOFF64-NEXT:    retq
+; X64-LABEL: test_movw:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
 ;
 ; BWON32-LABEL: test_movw:
 ; BWON32:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll
index a8dd97b..69d01fe 100644
--- a/llvm/test/CodeGen/X86/fma.ll
+++ b/llvm/test/CodeGen/X86/fma.ll
@@ -1351,14 +1351,13 @@
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1,2,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
-; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x20]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL64-NEXT:    movaps %xmm1, %xmm3 ## encoding: [0x0f,0x28,0xd9]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x5c,0x24,0x40]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm3 ## encoding: [0x0f,0x14,0xd8]
+; FMACALL64-NEXT:    ## xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x5c,0x24,0x20]
+; FMACALL64-NEXT:    ## xmm3 = xmm3[0],mem[0]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x60]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll b/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll
index ca1a1c1..d210f17 100644
--- a/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll
+++ b/llvm/test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -28,6 +28,7 @@
 ;
 ; X64-LABEL: test_sext1:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
 ; X64-NEXT:    movaps %xmm0, (%rdi)
@@ -35,7 +36,6 @@
 ; X64-NEXT:    movq $-1, 48(%rdi)
 ; X64-NEXT:    movq $-1, 40(%rdi)
 ; X64-NEXT:    movq $-99, 32(%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Se = sext <2 x i8> <i8 -100, i8 -99> to <2 x i256>
   %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
@@ -66,6 +66,7 @@
 ;
 ; X64-LABEL: test_sext2:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
 ; X64-NEXT:    movaps %xmm0, (%rdi)
@@ -73,7 +74,6 @@
 ; X64-NEXT:    movq $-1, 48(%rdi)
 ; X64-NEXT:    movq $-1, 40(%rdi)
 ; X64-NEXT:    movq $-1999, 32(%rdi) # imm = 0xF831
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Se = sext <2 x i128> <i128 -2000, i128 -1999> to <2 x i256>
   %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
@@ -104,13 +104,13 @@
 ;
 ; X64-LABEL: test_zext1:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 48(%rdi)
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
 ; X64-NEXT:    movaps %xmm0, (%rdi)
 ; X64-NEXT:    movq $0, 40(%rdi)
 ; X64-NEXT:    movq $254, 32(%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Se = zext <2 x i8> <i8 -1, i8 -2> to <2 x i256>
   %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
@@ -141,13 +141,13 @@
 ;
 ; X64-LABEL: test_zext2:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 48(%rdi)
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
 ; X64-NEXT:    movaps %xmm0, (%rdi)
 ; X64-NEXT:    movq $-1, 40(%rdi)
 ; X64-NEXT:    movq $-2, 32(%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Se = zext <2 x i128> <i128 -1, i128 -2> to <2 x i256>
   %Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/funnel-shift-rot.ll b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
index 3455a05..19e75ab 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-rot.ll
@@ -25,8 +25,9 @@
 ;
 ; X64-AVX2-LABEL: rotl_i8_const_shift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    rolb $3, %dil
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    rolb $3, %al
+; X64-AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-AVX2-NEXT:    retq
   %f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
   ret i8 %f
@@ -44,8 +45,8 @@
 ;
 ; X64-AVX2-LABEL: rotl_i64_const_shift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    rolq $3, %rdi
 ; X64-AVX2-NEXT:    movq %rdi, %rax
+; X64-AVX2-NEXT:    rolq $3, %rax
 ; X64-AVX2-NEXT:    retq
   %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 3)
   ret i64 %f
@@ -62,8 +63,10 @@
 ; X64-AVX2-LABEL: rotl_i16:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movl %esi, %ecx
-; X64-AVX2-NEXT:    rolw %cl, %di
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-AVX2-NEXT:    rolw %cl, %ax
+; X64-AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-AVX2-NEXT:    retq
   %f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z)
   ret i16 %f
@@ -80,8 +83,9 @@
 ; X64-AVX2-LABEL: rotl_i32:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movl %esi, %ecx
-; X64-AVX2-NEXT:    roll %cl, %edi
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-AVX2-NEXT:    roll %cl, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %z)
   ret i32 %f
@@ -174,8 +178,9 @@
 ;
 ; X64-AVX2-LABEL: rotr_i8_const_shift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    rorb $3, %dil
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    rorb $3, %al
+; X64-AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-AVX2-NEXT:    retq
   %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
   ret i8 %f
@@ -190,8 +195,8 @@
 ;
 ; X64-AVX2-LABEL: rotr_i32_const_shift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    rorl $3, %edi
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    rorl $3, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 3)
   ret i32 %f
@@ -210,8 +215,10 @@
 ; X64-AVX2-LABEL: rotr_i16:
 ; X64-AVX2:       # %bb.0:
 ; X64-AVX2-NEXT:    movl %esi, %ecx
-; X64-AVX2-NEXT:    rorw %cl, %di
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-AVX2-NEXT:    rorw %cl, %ax
+; X64-AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-AVX2-NEXT:    retq
   %f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
   ret i16 %f
@@ -257,9 +264,10 @@
 ;
 ; X64-AVX2-LABEL: rotr_i64:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    movl %esi, %ecx
-; X64-AVX2-NEXT:    rorq %cl, %rdi
+; X64-AVX2-NEXT:    movq %rsi, %rcx
 ; X64-AVX2-NEXT:    movq %rdi, %rax
+; X64-AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-AVX2-NEXT:    rorq %cl, %rax
 ; X64-AVX2-NEXT:    retq
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z)
   ret i64 %f
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index f349fca..678e447 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -33,11 +33,11 @@
 ;
 ; X64-AVX2-LABEL: fshl_i32:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    andl $31, %edx
-; X64-AVX2-NEXT:    movl %edi, %eax
 ; X64-AVX2-NEXT:    movl %edx, %ecx
+; X64-AVX2-NEXT:    andl $31, %ecx
+; X64-AVX2-NEXT:    movl %edi, %eax
 ; X64-AVX2-NEXT:    shldl %cl, %esi, %eax
-; X64-AVX2-NEXT:    testl %edx, %edx
+; X64-AVX2-NEXT:    testl %ecx, %ecx
 ; X64-AVX2-NEXT:    cmovel %edi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -152,8 +152,8 @@
 ;
 ; X64-AVX2-LABEL: fshl_i32_const_shift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    shldl $9, %esi, %edi
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    shldl $9, %esi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
   ret i32 %f
@@ -171,8 +171,8 @@
 ;
 ; X64-AVX2-LABEL: fshl_i32_const_overshift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    shldl $9, %esi, %edi
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    shldl $9, %esi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
   ret i32 %f
@@ -192,8 +192,8 @@
 ;
 ; X64-AVX2-LABEL: fshl_i64_const_overshift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    shldq $41, %rsi, %rdi
 ; X64-AVX2-NEXT:    movq %rdi, %rax
+; X64-AVX2-NEXT:    shldq $41, %rsi, %rax
 ; X64-AVX2-NEXT:    retq
   %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
   ret i64 %f
@@ -231,11 +231,11 @@
 ;
 ; X64-AVX2-LABEL: fshr_i32:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    andl $31, %edx
-; X64-AVX2-NEXT:    movl %esi, %eax
 ; X64-AVX2-NEXT:    movl %edx, %ecx
+; X64-AVX2-NEXT:    andl $31, %ecx
+; X64-AVX2-NEXT:    movl %esi, %eax
 ; X64-AVX2-NEXT:    shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT:    testl %edx, %edx
+; X64-AVX2-NEXT:    testl %ecx, %ecx
 ; X64-AVX2-NEXT:    cmovel %esi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -346,8 +346,8 @@
 ;
 ; X64-AVX2-LABEL: fshr_i32_const_shift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    shldl $23, %esi, %edi
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    shldl $23, %esi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
   ret i32 %f
@@ -365,8 +365,8 @@
 ;
 ; X64-AVX2-LABEL: fshr_i32_const_overshift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    shldl $23, %esi, %edi
 ; X64-AVX2-NEXT:    movl %edi, %eax
+; X64-AVX2-NEXT:    shldl $23, %esi, %eax
 ; X64-AVX2-NEXT:    retq
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
   ret i32 %f
@@ -386,8 +386,8 @@
 ;
 ; X64-AVX2-LABEL: fshr_i64_const_overshift:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    shldq $23, %rsi, %rdi
 ; X64-AVX2-NEXT:    movq %rdi, %rax
+; X64-AVX2-NEXT:    shldq $23, %rsi, %rax
 ; X64-AVX2-NEXT:    retq
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
   ret i64 %f
diff --git a/llvm/test/CodeGen/X86/ghc-cc64.ll b/llvm/test/CodeGen/X86/ghc-cc64.ll
index e8b0f06..9abd40b 100644
--- a/llvm/test/CodeGen/X86/ghc-cc64.ll
+++ b/llvm/test/CodeGen/X86/ghc-cc64.ll
@@ -22,8 +22,8 @@
 
 define void @zap(i64 %a, i64 %b) nounwind {
 entry:
-  ; CHECK:      movq %rdi, %r13
-  ; CHECK-NEXT: movq %rsi, %rbp
+  ; CHECK:      movq %rsi, %rbp
+  ; CHECK-NEXT: movq %rdi, %r13
   ; CHECK-NEXT: callq addtwo
   %0 = call ghccc i64 @addtwo(i64 %a, i64 %b)
   ; CHECK:      callq foo
diff --git a/llvm/test/CodeGen/X86/hipe-cc64.ll b/llvm/test/CodeGen/X86/hipe-cc64.ll
index cf5788f..1758b4c 100644
--- a/llvm/test/CodeGen/X86/hipe-cc64.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc64.ll
@@ -4,11 +4,10 @@
 
 define void @zap(i64 %a, i64 %b) nounwind {
 entry:
-  ; CHECK:      movq %rsi, %rax
+  ; CHECK:      movq %rsi, %rdx
   ; CHECK-NEXT: movl $8, %ecx
   ; CHECK-NEXT: movl $9, %r8d
   ; CHECK-NEXT: movq %rdi, %rsi
-  ; CHECK-NEXT: movq %rax, %rdx
   ; CHECK-NEXT: callq addfour
   %0 = call cc 11 {i64, i64, i64} @addfour(i64 undef, i64 undef, i64 %a, i64 %b, i64 8, i64 9)
   %res = extractvalue {i64, i64, i64} %0, 2
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 30040bd..8069eab 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -336,17 +336,17 @@
 ; X64-BMI-NEXT:    movq %rcx, %r8
 ; X64-BMI-NEXT:    movq %rdx, %r9
 ; X64-BMI-NEXT:    xorl %r10d, %r10d
-; X64-BMI-NEXT:    xorl %eax, %eax
+; X64-BMI-NEXT:    xorl %ecx, %ecx
 ; X64-BMI-NEXT:    .p2align 4, 0x90
 ; X64-BMI-NEXT:  .LBB1_2: # %for.body
 ; X64-BMI-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-BMI-NEXT:    movq %r8, %rdx
-; X64-BMI-NEXT:    mulxq (%r9,%rax,8), %rcx, %rdx
-; X64-BMI-NEXT:    addq %r10, %rcx
+; X64-BMI-NEXT:    mulxq (%r9,%rcx,8), %rax, %rdx
+; X64-BMI-NEXT:    addq %r10, %rax
 ; X64-BMI-NEXT:    adcq $0, %rdx
-; X64-BMI-NEXT:    movq %rcx, (%rsi,%rax,8)
-; X64-BMI-NEXT:    incq %rax
-; X64-BMI-NEXT:    cmpq %rax, %rdi
+; X64-BMI-NEXT:    movq %rax, (%rsi,%rcx,8)
+; X64-BMI-NEXT:    incq %rcx
+; X64-BMI-NEXT:    cmpq %rcx, %rdi
 ; X64-BMI-NEXT:    movq %rdx, %r10
 ; X64-BMI-NEXT:    jne .LBB1_2
 ; X64-BMI-NEXT:  .LBB1_3: # %for.end
diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll
index 96f8f5c..a731e53 100644
--- a/llvm/test/CodeGen/X86/iabs.ll
+++ b/llvm/test/CodeGen/X86/iabs.ll
@@ -22,10 +22,11 @@
 ; X64-LABEL: test_i8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    sarb $7, %al
-; X64-NEXT:    addb %al, %dil
-; X64-NEXT:    xorb %al, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    sarb $7, %cl
+; X64-NEXT:    addb %cl, %al
+; X64-NEXT:    xorb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %tmp1neg = sub i8 0, %a
   %b = icmp sgt i8 %a, -1
diff --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll
index 0f1a824..0288e61 100644
--- a/llvm/test/CodeGen/X86/imul.ll
+++ b/llvm/test/CodeGen/X86/imul.ll
@@ -39,8 +39,8 @@
 define i32 @mul4096_32(i32 %A) {
 ; X64-LABEL: mul4096_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    shll $12, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $12, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: mul4096_32:
@@ -55,8 +55,8 @@
 define i64 @mul4096_64(i64 %A) {
 ; X64-LABEL: mul4096_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    shlq $12, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shlq $12, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: mul4096_64:
@@ -73,9 +73,9 @@
 define i32 @mulmin4096_32(i32 %A) {
 ; X64-LABEL: mulmin4096_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    shll $12, %edi
-; X64-NEXT:    negl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $12, %eax
+; X64-NEXT:    negl %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: mulmin4096_32:
@@ -91,9 +91,9 @@
 define i64 @mulmin4096_64(i64 %A) {
 ; X64-LABEL: mulmin4096_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    shlq $12, %rdi
-; X64-NEXT:    negq %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shlq $12, %rax
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: mulmin4096_64:
@@ -268,8 +268,8 @@
 define i32 @mul4294967295_32(i32 %A) {
 ; X64-LABEL: mul4294967295_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    negl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    negl %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: mul4294967295_32:
@@ -284,8 +284,8 @@
 define i64 @mul18446744073709551615_64(i64 %A) {
 ; X64-LABEL: mul18446744073709551615_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    negq %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: mul18446744073709551615_64:
@@ -323,9 +323,9 @@
 ; X64-LABEL: test1:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $5, %eax
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    shll $5, %ecx
+; X64-NEXT:    subl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test1:
@@ -412,9 +412,9 @@
 ; X64-LABEL: test5:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $5, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    shlq $5, %rcx
+; X64-NEXT:    subq %rcx, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test5:
@@ -530,9 +530,9 @@
 ; X64-LABEL: testNegOverflow:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $63, %rax
-; X64-NEXT:    subq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    shlq $63, %rcx
+; X64-NEXT:    subq %rcx, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: testNegOverflow:
diff --git a/llvm/test/CodeGen/X86/lea-opt.ll b/llvm/test/CodeGen/X86/lea-opt.ll
index 6899bab..55dbd02 100644
--- a/llvm/test/CodeGen/X86/lea-opt.ll
+++ b/llvm/test/CodeGen/X86/lea-opt.ll
@@ -311,9 +311,9 @@
 define  i32 @test5(i32 %x, i32 %y)  #0 {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl %esi, %esi
-; CHECK-NEXT:    subl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    addl %esi, %esi
+; CHECK-NEXT:    subl %esi, %eax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i32 %y, -2
@@ -325,9 +325,9 @@
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
-; CHECK-NEXT:    leal (%rsi,%rsi,2), %eax
-; CHECK-NEXT:    subl %eax, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    leal (%rsi,%rsi,2), %ecx
+; CHECK-NEXT:    subl %ecx, %eax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i32 %y, -3
@@ -338,9 +338,9 @@
 define  i32 @test7(i32 %x, i32 %y)  #0 {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shll $2, %esi
-; CHECK-NEXT:    subl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $2, %esi
+; CHECK-NEXT:    subl %esi, %eax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i32 %y, -4
@@ -365,9 +365,9 @@
 define  i32 @test9(i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl %esi, %esi
-; CHECK-NEXT:    subl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    addl %esi, %esi
+; CHECK-NEXT:    subl %esi, %eax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i32 -2, %y
@@ -379,9 +379,9 @@
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
-; CHECK-NEXT:    leal (%rsi,%rsi,2), %eax
-; CHECK-NEXT:    subl %eax, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    leal (%rsi,%rsi,2), %ecx
+; CHECK-NEXT:    subl %ecx, %eax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i32 -3, %y
@@ -392,9 +392,9 @@
 define  i32 @test11(i32 %x, i32 %y) #0 {
 ; CHECK-LABEL: test11:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shll $2, %esi
-; CHECK-NEXT:    subl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $2, %esi
+; CHECK-NEXT:    subl %esi, %eax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i32 -4, %y
@@ -418,9 +418,9 @@
 define  i64 @test13(i64 %x, i64 %y) #0 {
 ; CHECK-LABEL: test13:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shlq $2, %rsi
-; CHECK-NEXT:    subq %rsi, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shlq $2, %rsi
+; CHECK-NEXT:    subq %rsi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %mul = mul nsw i64 -4, %y
@@ -444,9 +444,10 @@
 define  zeroext i16 @test15(i16 zeroext %x, i16 zeroext %y) #0 {
 ; CHECK-LABEL: test15:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shll $3, %esi
-; CHECK-NEXT:    subl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shll $3, %esi
+; CHECK-NEXT:    subl %esi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 entry:
   %conv = zext i16 %x to i32
diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll
index fdb21e0..8e549fe 100644
--- a/llvm/test/CodeGen/X86/legalize-shift-64.ll
+++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll
@@ -88,6 +88,8 @@
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-NEXT:    .cfi_offset %ebp, -8
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -101,12 +103,11 @@
 ; CHECK-NEXT:    movl %edi, %esi
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:  .LBB4_2:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; CHECK-NEXT:    movl %edx, %ebx
-; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    movb %ch, %cl
 ; CHECK-NEXT:    shll %cl, %ebx
 ; CHECK-NEXT:    shldl %cl, %edx, %ebp
-; CHECK-NEXT:    testb $32, %cl
+; CHECK-NEXT:    testb $32, %ch
 ; CHECK-NEXT:    je .LBB4_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    movl %ebx, %ebp
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index fdd73b8..dcce437 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -42,21 +42,21 @@
 ;
 ; X64-LABEL: test_shl:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    shldq $2, %rax, %rcx
-; X64-NEXT:    shldq $2, %rdx, %rax
-; X64-NEXT:    shldq $2, %r9, %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    shldq $2, %rcx, %rdx
+; X64-NEXT:    shldq $2, %rsi, %rcx
+; X64-NEXT:    shldq $2, %r9, %rsi
 ; X64-NEXT:    shlq $2, %r9
-; X64-NEXT:    movq %rcx, 56(%rdi)
-; X64-NEXT:    movq %rax, 48(%rdi)
-; X64-NEXT:    movq %rdx, 40(%rdi)
+; X64-NEXT:    movq %rdx, 56(%rdi)
+; X64-NEXT:    movq %rcx, 48(%rdi)
+; X64-NEXT:    movq %rsi, 40(%rdi)
 ; X64-NEXT:    movq %r9, 32(%rdi)
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
 ; X64-NEXT:    movaps %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Amt = insertelement <2 x i256> <i256 1, i256 2>, i256 -1, i32 0
   %Out = shl <2 x i256> %In, %Amt
@@ -88,7 +88,7 @@
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    shldl $28, %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    shldl $28, %esi, %eax
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    shldl $28, %edi, %esi
@@ -101,7 +101,7 @@
 ; X32-NEXT:    shrl $4, %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl %edx, 60(%eax)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    movl %edx, 56(%eax)
 ; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X32-NEXT:    movl %edx, 52(%eax)
@@ -132,21 +132,21 @@
 ;
 ; X64-LABEL: test_srl:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    shrdq $4, %rdx, %r9
-; X64-NEXT:    shrdq $4, %rax, %rdx
-; X64-NEXT:    shrdq $4, %rcx, %rax
-; X64-NEXT:    shrq $4, %rcx
-; X64-NEXT:    movq %rcx, 56(%rdi)
-; X64-NEXT:    movq %rax, 48(%rdi)
-; X64-NEXT:    movq %rdx, 40(%rdi)
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    shrdq $4, %rsi, %r9
+; X64-NEXT:    shrdq $4, %rcx, %rsi
+; X64-NEXT:    shrdq $4, %rdx, %rcx
+; X64-NEXT:    shrq $4, %rdx
+; X64-NEXT:    movq %rdx, 56(%rdi)
+; X64-NEXT:    movq %rcx, 48(%rdi)
+; X64-NEXT:    movq %rsi, 40(%rdi)
 ; X64-NEXT:    movq %r9, 32(%rdi)
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, 16(%rdi)
 ; X64-NEXT:    movaps %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Amt = insertelement <2 x i256> <i256 3, i256 4>, i256 -1, i32 0
   %Out = lshr <2 x i256> %In, %Amt
@@ -178,7 +178,7 @@
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    shldl $26, %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    shldl $26, %esi, %eax
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    shldl $26, %edi, %esi
@@ -191,7 +191,7 @@
 ; X32-NEXT:    sarl $6, %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl %edx, 60(%eax)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    movl %edx, 56(%eax)
 ; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X32-NEXT:    movl %edx, 52(%eax)
@@ -224,23 +224,23 @@
 ;
 ; X64-LABEL: test_sra:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    shrdq $6, %rdx, %r9
-; X64-NEXT:    shrdq $6, %rax, %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    shrdq $6, %rsi, %r9
+; X64-NEXT:    shrdq $6, %rcx, %rsi
 ; X64-NEXT:    sarq $63, %r8
-; X64-NEXT:    shrdq $6, %rcx, %rax
-; X64-NEXT:    sarq $6, %rcx
-; X64-NEXT:    movq %rcx, 56(%rdi)
-; X64-NEXT:    movq %rax, 48(%rdi)
-; X64-NEXT:    movq %rdx, 40(%rdi)
+; X64-NEXT:    shrdq $6, %rdx, %rcx
+; X64-NEXT:    sarq $6, %rdx
+; X64-NEXT:    movq %rdx, 56(%rdi)
+; X64-NEXT:    movq %rcx, 48(%rdi)
+; X64-NEXT:    movq %rsi, 40(%rdi)
 ; X64-NEXT:    movq %r9, 32(%rdi)
 ; X64-NEXT:    movq %r8, 24(%rdi)
 ; X64-NEXT:    movq %r8, 16(%rdi)
 ; X64-NEXT:    movq %r8, 8(%rdi)
 ; X64-NEXT:    movq %r8, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
   %Amt = insertelement <2 x i256> <i256 5, i256 6>, i256 -1, i32 0
   %Out = ashr <2 x i256> %In, %Amt
diff --git a/llvm/test/CodeGen/X86/machine-combiner-int.ll b/llvm/test/CodeGen/X86/machine-combiner-int.ll
index ba1a564..2d1fbb4 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int.ll
@@ -62,10 +62,11 @@
 define i8 @reassociate_ands_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
 ; CHECK-LABEL: reassociate_ands_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subb  %sil, %dil
-; CHECK-NEXT:    andb  %cl, %dl
-; CHECK-NEXT:    andb  %dil, %dl
 ; CHECK-NEXT:    movl  %edx, %eax
+; CHECK-NEXT:    subb  %sil, %dil
+; CHECK-NEXT:    andb  %cl, %al
+; CHECK-NEXT:    andb  %dil, %al
+; CHECK-NEXT:    # kill
 ; CHECK-NEXT:    retq
   %t0 = sub i8 %x0, %x1
   %t1 = and i8 %x2, %t0
@@ -78,10 +79,10 @@
 define i32 @reassociate_ands_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: reassociate_ands_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subl  %esi, %edi
-; CHECK-NEXT:    andl  %ecx, %edx
-; CHECK-NEXT:    andl  %edi, %edx
 ; CHECK-NEXT:    movl  %edx, %eax
+; CHECK-NEXT:    subl  %esi, %edi
+; CHECK-NEXT:    andl  %ecx, %eax
+; CHECK-NEXT:    andl  %edi, %eax
 ; CHECK-NEXT:    retq
   %t0 = sub i32 %x0, %x1
   %t1 = and i32 %x2, %t0
@@ -92,10 +93,10 @@
 define i64 @reassociate_ands_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
 ; CHECK-LABEL: reassociate_ands_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq  %rsi, %rdi
-; CHECK-NEXT:    andq  %rcx, %rdx
-; CHECK-NEXT:    andq  %rdi, %rdx
 ; CHECK-NEXT:    movq  %rdx, %rax
+; CHECK-NEXT:    subq  %rsi, %rdi
+; CHECK-NEXT:    andq  %rcx, %rax
+; CHECK-NEXT:    andq  %rdi, %rax
 ; CHECK-NEXT:    retq
   %t0 = sub i64 %x0, %x1
   %t1 = and i64 %x2, %t0
@@ -109,10 +110,11 @@
 define i8 @reassociate_ors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
 ; CHECK-LABEL: reassociate_ors_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subb  %sil, %dil
-; CHECK-NEXT:    orb   %cl, %dl
-; CHECK-NEXT:    orb   %dil, %dl
 ; CHECK-NEXT:    movl  %edx, %eax
+; CHECK-NEXT:    subb  %sil, %dil
+; CHECK-NEXT:    orb   %cl, %al
+; CHECK-NEXT:    orb   %dil, %al
+; CHECK-NEXT:    # kill
 ; CHECK-NEXT:    retq
   %t0 = sub i8 %x0, %x1
   %t1 = or i8 %x2, %t0
@@ -125,10 +127,10 @@
 define i32 @reassociate_ors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: reassociate_ors_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subl  %esi, %edi
-; CHECK-NEXT:    orl   %ecx, %edx
-; CHECK-NEXT:    orl   %edi, %edx
 ; CHECK-NEXT:    movl  %edx, %eax
+; CHECK-NEXT:    subl  %esi, %edi
+; CHECK-NEXT:    orl   %ecx, %eax
+; CHECK-NEXT:    orl   %edi, %eax
 ; CHECK-NEXT:    retq
   %t0 = sub i32 %x0, %x1
   %t1 = or i32 %x2, %t0
@@ -139,10 +141,10 @@
 define i64 @reassociate_ors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
 ; CHECK-LABEL: reassociate_ors_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq  %rsi, %rdi
-; CHECK-NEXT:    orq   %rcx, %rdx
-; CHECK-NEXT:    orq   %rdi, %rdx
 ; CHECK-NEXT:    movq  %rdx, %rax
+; CHECK-NEXT:    subq  %rsi, %rdi
+; CHECK-NEXT:    orq   %rcx, %rax
+; CHECK-NEXT:    orq   %rdi, %rax
 ; CHECK-NEXT:    retq
   %t0 = sub i64 %x0, %x1
   %t1 = or i64 %x2, %t0
@@ -156,10 +158,11 @@
 define i8 @reassociate_xors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
 ; CHECK-LABEL: reassociate_xors_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subb  %sil, %dil
-; CHECK-NEXT:    xorb  %cl, %dl
-; CHECK-NEXT:    xorb  %dil, %dl
 ; CHECK-NEXT:    movl  %edx, %eax
+; CHECK-NEXT:    subb  %sil, %dil
+; CHECK-NEXT:    xorb  %cl, %al
+; CHECK-NEXT:    xorb  %dil, %al
+; CHECK-NEXT:    # kill
 ; CHECK-NEXT:    retq
   %t0 = sub i8 %x0, %x1
   %t1 = xor i8 %x2, %t0
@@ -172,10 +175,10 @@
 define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
 ; CHECK-LABEL: reassociate_xors_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subl  %esi, %edi
-; CHECK-NEXT:    xorl  %ecx, %edx
-; CHECK-NEXT:    xorl  %edi, %edx
 ; CHECK-NEXT:    movl  %edx, %eax
+; CHECK-NEXT:    subl  %esi, %edi
+; CHECK-NEXT:    xorl  %ecx, %eax
+; CHECK-NEXT:    xorl  %edi, %eax
 ; CHECK-NEXT:    retq
   %t0 = sub i32 %x0, %x1
   %t1 = xor i32 %x2, %t0
@@ -186,10 +189,10 @@
 define i64 @reassociate_xors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
 ; CHECK-LABEL: reassociate_xors_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq  %rsi, %rdi
-; CHECK-NEXT:    xorq  %rcx, %rdx
-; CHECK-NEXT:    xorq  %rdi, %rdx
 ; CHECK-NEXT:    movq  %rdx, %rax
+; CHECK-NEXT:    subq  %rsi, %rdi
+; CHECK-NEXT:    xorq  %rcx, %rax
+; CHECK-NEXT:    xorq  %rdi, %rax
 ; CHECK-NEXT:    retq
   %t0 = sub i64 %x0, %x1
   %t1 = xor i64 %x2, %t0
diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index 3b6fcd7..a405e05 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -103,30 +103,29 @@
 ; CHECK:       ## %bb.0: ## %bb
 ; CHECK-NEXT:    movaps %xmm3, %xmm9
 ; CHECK-NEXT:    movaps %xmm2, %xmm8
-; CHECK-NEXT:    movaps %xmm1, %xmm6
 ; CHECK-NEXT:    movaps %xmm0, %xmm7
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    movaps %xmm3, %xmm1
-; CHECK-NEXT:    cmpltps %xmm0, %xmm1
-; CHECK-NEXT:    movaps %xmm1, %xmm4
+; CHECK-NEXT:    movaps %xmm3, %xmm2
+; CHECK-NEXT:    cmpltps %xmm0, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm4
 ; CHECK-NEXT:    orps {{.*}}(%rip), %xmm4
 ; CHECK-NEXT:    movaps %xmm4, %xmm10
-; CHECK-NEXT:    andnps %xmm1, %xmm10
-; CHECK-NEXT:    movaps %xmm2, %xmm1
-; CHECK-NEXT:    cmpltps %xmm0, %xmm1
+; CHECK-NEXT:    andnps %xmm2, %xmm10
+; CHECK-NEXT:    movaps %xmm8, %xmm5
+; CHECK-NEXT:    cmpltps %xmm0, %xmm5
 ; CHECK-NEXT:    movaps {{.*#+}} xmm11 = [9,10,11,12]
-; CHECK-NEXT:    movaps %xmm1, %xmm3
-; CHECK-NEXT:    orps %xmm11, %xmm3
-; CHECK-NEXT:    movaps %xmm3, %xmm14
-; CHECK-NEXT:    andnps %xmm1, %xmm14
-; CHECK-NEXT:    cvttps2dq %xmm6, %xmm12
-; CHECK-NEXT:    cmpltps %xmm0, %xmm6
+; CHECK-NEXT:    movaps %xmm5, %xmm2
+; CHECK-NEXT:    orps %xmm11, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm14
+; CHECK-NEXT:    andnps %xmm5, %xmm14
+; CHECK-NEXT:    cvttps2dq %xmm1, %xmm12
+; CHECK-NEXT:    cmpltps %xmm0, %xmm1
 ; CHECK-NEXT:    movaps {{.*#+}} xmm13 = [5,6,7,8]
-; CHECK-NEXT:    movaps %xmm6, %xmm2
-; CHECK-NEXT:    orps %xmm13, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm5
-; CHECK-NEXT:    andnps %xmm6, %xmm5
-; CHECK-NEXT:    cvttps2dq %xmm7, %xmm6
+; CHECK-NEXT:    movaps %xmm1, %xmm6
+; CHECK-NEXT:    orps %xmm13, %xmm6
+; CHECK-NEXT:    movaps %xmm6, %xmm5
+; CHECK-NEXT:    andnps %xmm1, %xmm5
+; CHECK-NEXT:    cvttps2dq %xmm7, %xmm3
 ; CHECK-NEXT:    cmpltps %xmm0, %xmm7
 ; CHECK-NEXT:    movaps {{.*#+}} xmm15 = [1,2,3,4]
 ; CHECK-NEXT:    movaps %xmm7, %xmm0
@@ -134,30 +133,29 @@
 ; CHECK-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-NEXT:    andnps %xmm7, %xmm1
 ; CHECK-NEXT:    andps %xmm15, %xmm0
-; CHECK-NEXT:    cvtdq2ps %xmm6, %xmm6
-; CHECK-NEXT:    andps %xmm6, %xmm0
-; CHECK-NEXT:    movaps {{.*#+}} xmm6 = [1,1,1,1]
-; CHECK-NEXT:    andps %xmm6, %xmm1
+; CHECK-NEXT:    cvtdq2ps %xmm3, %xmm3
+; CHECK-NEXT:    andps %xmm3, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; CHECK-NEXT:    andps %xmm3, %xmm1
 ; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    andps %xmm13, %xmm2
+; CHECK-NEXT:    andps %xmm13, %xmm6
 ; CHECK-NEXT:    cvtdq2ps %xmm12, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm2
-; CHECK-NEXT:    andps %xmm6, %xmm5
-; CHECK-NEXT:    orps %xmm5, %xmm2
-; CHECK-NEXT:    andps %xmm11, %xmm3
+; CHECK-NEXT:    andps %xmm1, %xmm6
+; CHECK-NEXT:    andps %xmm3, %xmm5
+; CHECK-NEXT:    orps %xmm5, %xmm6
+; CHECK-NEXT:    andps %xmm11, %xmm2
 ; CHECK-NEXT:    cvttps2dq %xmm8, %xmm1
 ; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm3
-; CHECK-NEXT:    andps %xmm6, %xmm14
-; CHECK-NEXT:    orps %xmm14, %xmm3
-; CHECK-NEXT:    andps %xmm6, %xmm10
+; CHECK-NEXT:    andps %xmm1, %xmm2
+; CHECK-NEXT:    andps %xmm3, %xmm14
+; CHECK-NEXT:    orps %xmm14, %xmm2
+; CHECK-NEXT:    andps %xmm3, %xmm10
 ; CHECK-NEXT:    andps {{.*}}(%rip), %xmm4
 ; CHECK-NEXT:    cvttps2dq %xmm9, %xmm1
 ; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
 ; CHECK-NEXT:    andps %xmm1, %xmm4
 ; CHECK-NEXT:    orps %xmm10, %xmm4
-; CHECK-NEXT:    movaps %xmm2, %xmm1
-; CHECK-NEXT:    movaps %xmm3, %xmm2
+; CHECK-NEXT:    movaps %xmm6, %xmm1
 ; CHECK-NEXT:    movaps %xmm4, %xmm3
 ; CHECK-NEXT:    retq
 bb:
diff --git a/llvm/test/CodeGen/X86/machine-cse.ll b/llvm/test/CodeGen/X86/machine-cse.ll
index ec831cb..b55b43f 100644
--- a/llvm/test/CodeGen/X86/machine-cse.ll
+++ b/llvm/test/CodeGen/X86/machine-cse.ll
@@ -136,21 +136,21 @@
 ; CHECK-NEXT:    testq %rcx, %rcx
 ; CHECK-NEXT:    je .LBB3_4
 ; CHECK-NEXT:  # %bb.1: # %preheader
-; CHECK-NEXT:    movzbl %dl, %eax
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movzbl %dl, %edx
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB3_2: # %do.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    cmpl %eax, %esi
+; CHECK-NEXT:    cmpl %edx, %esi
 ; CHECK-NEXT:    je .LBB3_5
 ; CHECK-NEXT:  # %bb.3: # %do.cond
 ; CHECK-NEXT:    # in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    incq %rax
 ; CHECK-NEXT:    decq %rcx
 ; CHECK-NEXT:    jne .LBB3_2
 ; CHECK-NEXT:  .LBB3_4:
-; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:  .LBB3_5: # %return
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %cmp = icmp eq i64 %n, 0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 0bf224e..30320a7 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2219,6 +2219,7 @@
 define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
 ; SSE2-LABEL: jumbled_indices32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    pmaddwd {{[0-9]+}}(%rsp), %xmm2
@@ -2235,7 +2236,6 @@
 ; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
 ; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: jumbled_indices32:
@@ -2437,6 +2437,7 @@
 define <32 x i32> @pmaddwd_1024(<64 x i16>* %Aptr, <64 x i16>* %Bptr) {
 ; SSE2-LABEL: pmaddwd_1024:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movdqa 112(%rsi), %xmm0
 ; SSE2-NEXT:    movdqa 96(%rsi), %xmm1
 ; SSE2-NEXT:    movdqa 80(%rsi), %xmm2
@@ -2461,7 +2462,6 @@
 ; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
 ; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
 ; SSE2-NEXT:    movdqa %xmm4, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: pmaddwd_1024:
diff --git a/llvm/test/CodeGen/X86/mask-negated-bool.ll b/llvm/test/CodeGen/X86/mask-negated-bool.ll
index b0147c3..5a3c3be 100644
--- a/llvm/test/CodeGen/X86/mask-negated-bool.ll
+++ b/llvm/test/CodeGen/X86/mask-negated-bool.ll
@@ -4,8 +4,8 @@
 define i32 @mask_negated_zext_bool1(i1 %x) {
 ; CHECK-LABEL: mask_negated_zext_bool1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %ext = zext i1 %x to i32
   %neg = sub i32 0, %ext
@@ -38,8 +38,8 @@
 define i32 @mask_negated_sext_bool1(i1 %x) {
 ; CHECK-LABEL: mask_negated_sext_bool1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %ext = sext i1 %x to i32
   %neg = sub i32 0, %ext
diff --git a/llvm/test/CodeGen/X86/misched-matmul.ll b/llvm/test/CodeGen/X86/misched-matmul.ll
index be3f086..7de9af2 100644
--- a/llvm/test/CodeGen/X86/misched-matmul.ll
+++ b/llvm/test/CodeGen/X86/misched-matmul.ll
@@ -10,7 +10,7 @@
 ; more complex cases.
 ;
 ; CHECK: @wrap_mul4
-; CHECK: 23 regalloc - Number of spills inserted
+; CHECK: 25 regalloc - Number of spills inserted
 
 define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
 entry:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll
index cf367ec..c2950cd 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i16.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll
@@ -11,6 +11,7 @@
 ; X64-LABEL: test_mul_by_1:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 1
   ret i16 %mul
@@ -297,8 +298,9 @@
 ;
 ; X64-LABEL: test_mul_by_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    shll $4, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 16
   ret i16 %mul
@@ -632,8 +634,9 @@
 ;
 ; X64-LABEL: test_mul_by_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    shll $5, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 32
   ret i16 %mul
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index 04f867b..0ad1687 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -787,14 +787,14 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_16:
 ; X64-HSW:       # %bb.0:
-; X64-HSW-NEXT:    shll $4, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    shll $4, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_16:
 ; X64-JAG:       # %bb.0:
-; X64-JAG-NEXT:    shll $4, %edi # sched: [1:0.50]
 ; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    shll $4, %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_16:
@@ -805,26 +805,26 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_16:
 ; HSW-NOOPT:       # %bb.0:
-; HSW-NOOPT-NEXT:    shll $4, %edi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    shll $4, %eax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    retq # sched: [7:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_16:
 ; JAG-NOOPT:       # %bb.0:
-; JAG-NOOPT-NEXT:    shll $4, %edi # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    shll $4, %eax # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
 ;
 ; X64-SLM-LABEL: test_mul_by_16:
 ; X64-SLM:       # %bb.0:
-; X64-SLM-NEXT:    shll $4, %edi # sched: [1:1.00]
 ; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    shll $4, %eax # sched: [1:1.00]
 ; X64-SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SLM-NOOPT-LABEL: test_mul_by_16:
 ; SLM-NOOPT:       # %bb.0:
-; SLM-NOOPT-NEXT:    shll $4, %edi # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    shll $4, %eax # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 16
   ret i32 %mul
@@ -1633,14 +1633,14 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_32:
 ; X64-HSW:       # %bb.0:
-; X64-HSW-NEXT:    shll $5, %edi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_32:
 ; X64-JAG:       # %bb.0:
-; X64-JAG-NEXT:    shll $5, %edi # sched: [1:0.50]
 ; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    shll $5, %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_32:
@@ -1651,26 +1651,26 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_32:
 ; HSW-NOOPT:       # %bb.0:
-; HSW-NOOPT-NEXT:    shll $5, %edi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    shll $5, %eax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    retq # sched: [7:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_32:
 ; JAG-NOOPT:       # %bb.0:
-; JAG-NOOPT-NEXT:    shll $5, %edi # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    shll $5, %eax # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
 ;
 ; X64-SLM-LABEL: test_mul_by_32:
 ; X64-SLM:       # %bb.0:
-; X64-SLM-NEXT:    shll $5, %edi # sched: [1:1.00]
 ; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    shll $5, %eax # sched: [1:1.00]
 ; X64-SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SLM-NOOPT-LABEL: test_mul_by_32:
 ; SLM-NOOPT:       # %bb.0:
-; SLM-NOOPT-NEXT:    shll $5, %edi # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    shll $5, %eax # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 32
   ret i32 %mul
@@ -2200,18 +2200,18 @@
 ;
 ; X64-HSW-LABEL: mul_neg_fold:
 ; X64-HSW:       # %bb.0:
-; X64-HSW-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    subl %eax, %esi # sched: [1:0.25]
 ; X64-HSW-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; X64-HSW-NEXT:    subl %ecx, %eax # sched: [1:0.25]
 ; X64-HSW-NEXT:    retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: mul_neg_fold:
 ; X64-JAG:       # %bb.0:
 ; X64-JAG-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
-; X64-JAG-NEXT:    subl %eax, %esi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %ecx # sched: [2:1.00]
 ; X64-JAG-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %ecx, %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: mul_neg_fold:
@@ -2235,9 +2235,9 @@
 ; X64-SLM-LABEL: mul_neg_fold:
 ; X64-SLM:       # %bb.0:
 ; X64-SLM-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-SLM-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
-; X64-SLM-NEXT:    subl %eax, %esi # sched: [1:0.50]
 ; X64-SLM-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,8), %ecx # sched: [1:1.00]
+; X64-SLM-NEXT:    subl %ecx, %eax # sched: [1:0.50]
 ; X64-SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SLM-NOOPT-LABEL: mul_neg_fold:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index af98a344..0e4680b 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -811,14 +811,14 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_16:
 ; X64-HSW:       # %bb.0:
-; X64-HSW-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    shlq $4, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_16:
 ; X64-JAG:       # %bb.0:
-; X64-JAG-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    shlq $4, %rax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_16:
@@ -831,26 +831,26 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_16:
 ; HSW-NOOPT:       # %bb.0:
-; HSW-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    shlq $4, %rax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    retq # sched: [7:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_16:
 ; JAG-NOOPT:       # %bb.0:
-; JAG-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    shlq $4, %rax # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
 ;
 ; X64-SLM-LABEL: test_mul_by_16:
 ; X64-SLM:       # %bb.0:
-; X64-SLM-NEXT:    shlq $4, %rdi # sched: [1:1.00]
 ; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    shlq $4, %rax # sched: [1:1.00]
 ; X64-SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SLM-NOOPT-LABEL: test_mul_by_16:
 ; SLM-NOOPT:       # %bb.0:
-; SLM-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    shlq $4, %rax # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 16
   ret i64 %mul
@@ -1716,14 +1716,14 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_32:
 ; X64-HSW:       # %bb.0:
-; X64-HSW-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [7:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_32:
 ; X64-JAG:       # %bb.0:
-; X64-JAG-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_32:
@@ -1736,26 +1736,26 @@
 ;
 ; HSW-NOOPT-LABEL: test_mul_by_32:
 ; HSW-NOOPT:       # %bb.0:
-; HSW-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; HSW-NOOPT-NEXT:    retq # sched: [7:1.00]
 ;
 ; JAG-NOOPT-LABEL: test_mul_by_32:
 ; JAG-NOOPT:       # %bb.0:
-; JAG-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    shlq $5, %rax # sched: [1:0.50]
 ; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
 ;
 ; X64-SLM-LABEL: test_mul_by_32:
 ; X64-SLM:       # %bb.0:
-; X64-SLM-NEXT:    shlq $5, %rdi # sched: [1:1.00]
 ; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    shlq $5, %rax # sched: [1:1.00]
 ; X64-SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SLM-NOOPT-LABEL: test_mul_by_32:
 ; SLM-NOOPT:       # %bb.0:
-; SLM-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    shlq $5, %rax # sched: [1:1.00]
 ; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 32
   ret i64 %mul
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index 8dd8a84..56db554 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -774,14 +774,15 @@
 ; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    addl %ebp, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload
-; X32-NEXT:    adcl %edx, %ebp
+; X32-NEXT:    movzbl %bl, %edi
+; X32-NEXT:    adcl %edx, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 88(%eax), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -789,34 +790,35 @@
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    addl %eax, %esi
-; X32-NEXT:    adcl %edx, %ebx
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    movl %ebp, %ebx
+; X32-NEXT:    addl %eax, %ebx
+; X32-NEXT:    adcl %edx, %esi
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %esi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    adcl %ebx, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 84(%eax), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -860,34 +862,35 @@
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 68(%ecx), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl 68(%ebp), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl %edi, %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %edx, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 72(%eax), %eax
+; X32-NEXT:    movl 72(%ebp), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %edx
+; X32-NEXT:    movl %ebx, %edx
+; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    addl %eax, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl %edi, %ebx
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl %esi, %ebx
@@ -1167,14 +1170,13 @@
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %cl
 ; X32-NEXT:    addl %eax, %ebp
@@ -1182,7 +1184,6 @@
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl %esi, %eax
-; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1191,17 +1192,17 @@
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %edx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %ebx, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    addl %edx, %eax
@@ -1209,12 +1210,12 @@
 ; X32-NEXT:    setb %dl
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    movzbl %dl, %eax
 ; X32-NEXT:    adcl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1437,29 +1438,29 @@
 ; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -1477,7 +1478,7 @@
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
@@ -2431,6 +2432,7 @@
 ; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebx, %edi
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -2449,27 +2451,25 @@
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -2647,30 +2647,30 @@
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -2682,8 +2682,7 @@
 ; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -2694,7 +2693,7 @@
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
@@ -3360,29 +3359,29 @@
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    movzbl %bl, %eax
@@ -3479,20 +3478,20 @@
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
@@ -3513,30 +3512,30 @@
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -4603,36 +4602,37 @@
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %eax, %esi
-; X32-NEXT:    adcl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
@@ -4666,8 +4666,7 @@
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -4735,31 +4734,30 @@
 ; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl %bl, %edi
 ; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %edi
-; X32-NEXT:    adcl %edx, %esi
+; X32-NEXT:    adcl %edx, %ecx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
@@ -4772,7 +4770,7 @@
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -4796,8 +4794,8 @@
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebp, %edi
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -4817,7 +4815,7 @@
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %esi, %edi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
@@ -4836,41 +4834,41 @@
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %edi
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ebp, %edx
+; X32-NEXT:    addl %edi, %edx
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, %ebp
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %edi, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -4880,7 +4878,7 @@
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
@@ -5647,7 +5645,6 @@
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, %esi
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
@@ -5660,19 +5657,19 @@
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
@@ -5818,7 +5815,6 @@
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
@@ -5831,19 +5827,19 @@
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
@@ -6003,20 +5999,19 @@
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %eax, %edi
+; X32-NEXT:    addl %eax, %ebp
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl 104(%ebp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 104(%eax), %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
@@ -6029,7 +6024,8 @@
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl 108(%ebp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 108(%eax), %esi
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6057,8 +6053,7 @@
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    adcl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %edi
@@ -6110,12 +6105,11 @@
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    imull %eax, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    addl %ebx, %edx
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    movl %esi, %ebx
@@ -6161,12 +6155,11 @@
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl 124(%edx), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 124(%edi), %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl 120(%edx), %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl 120(%edi), %esi
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ecx, %edx
@@ -6230,25 +6223,25 @@
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ecx, %ebp
+; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl %bl, %edi
 ; X32-NEXT:    adcl %edi, %edx
@@ -6264,26 +6257,26 @@
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ecx, %ebp
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl %bl, %edi
 ; X32-NEXT:    adcl %edi, %edx
@@ -6321,51 +6314,51 @@
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %eax, %esi
-; X32-NEXT:    adcl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebp, %edi
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb %cl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -6375,20 +6368,19 @@
 ; X32-NEXT:    adcl %edx, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    addl %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    imull %ebp, %esi
 ; X32-NEXT:    addl %edx, %esi
@@ -6533,12 +6525,12 @@
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6572,12 +6564,12 @@
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6704,6 +6696,7 @@
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rcx, %r11
 ; X64-NEXT:    adcq %rdi, %rbp
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    movzbl %bl, %ebx
@@ -6713,17 +6706,16 @@
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rdx, %r14
+; X64-NEXT:    addq %rbp, %rcx
 ; X64-NEXT:    movq %rcx, %r12
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    addq %rax, %r12
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r9
-; X64-NEXT:    movq %rdi, (%rsp) # 8-byte Spill
-; X64-NEXT:    adcq %rdx, %rax
-; X64-NEXT:    addq %rbp, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbx, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rbx, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rsi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    xorl %ebp, %ebp
@@ -6733,7 +6725,7 @@
 ; X64-NEXT:    movq 8(%rsi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rbp
-; X64-NEXT:    xorl %r11d, %r11d
+; X64-NEXT:    xorl %r9d, %r9d
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %rcx, %r15
 ; X64-NEXT:    movq %rdx, %rbp
@@ -6749,22 +6741,21 @@
 ; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdi, %r14
-; X64-NEXT:    addq %rax, %r14
-; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    addq %rbp, %r14
-; X64-NEXT:    adcq %rbx, %r11
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %rbp
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    addq %rax, %r9
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    adcq %rdx, %rax
+; X64-NEXT:    addq %rbp, %r9
+; X64-NEXT:    adcq %rbx, %rax
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rcx, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%r10), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    xorl %r8d, %r8d
@@ -6772,44 +6763,44 @@
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r9
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq 32(%r13), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %rbx, %r8
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    adcq %rdx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    addq %rdi, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdi, %r11
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    adcq %r15, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    adcq %r11, %rax
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    adcq %r9, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r11, %rdi
+; X64-NEXT:    adcq %rbp, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rbp, %rdi
 ; X64-NEXT:    movq 8(%r10), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %r11
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %rsi, %r12
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    addq %rbx, %r11
+; X64-NEXT:    addq %rbx, %r12
 ; X64-NEXT:    adcq %rsi, %rbp
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    setb %bl
@@ -6818,92 +6809,91 @@
 ; X64-NEXT:    adcq %rdx, %rbx
 ; X64-NEXT:    movq 16(%r10), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    addq %rax, %r8
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    addq %rax, %rcx
 ; X64-NEXT:    movq %rsi, %r10
 ; X64-NEXT:    adcq %rdx, %r10
-; X64-NEXT:    addq %rbp, %r8
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    addq %rbp, %rcx
 ; X64-NEXT:    adcq %rbx, %r10
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %rcx, %r12
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %r9, %rdx
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    movq %r8, %r14
+; X64-NEXT:    movq %r8, (%rsp) # 8-byte Spill
+; X64-NEXT:    addq %r11, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r11, %r8
-; X64-NEXT:    adcq %r11, %r15
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    adcq %r12, %r15
 ; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    adcq %rcx, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rcx, %r8
 ; X64-NEXT:    adcq %r10, %rdi
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq 40(%rsi), %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq 40(%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    xorl %r14d, %r14d
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    addq %r9, %rdi
+; X64-NEXT:    xorl %r9d, %r9d
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    addq %r11, %rcx
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    addq %r13, %rdi
-; X64-NEXT:    adcq %r9, %rbp
+; X64-NEXT:    addq %r13, %rcx
+; X64-NEXT:    adcq %r11, %rbp
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    addq %rax, %rbp
-; X64-NEXT:    movzbl %bl, %r11d
-; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    movq 48(%rsi), %rax
+; X64-NEXT:    movzbl %bl, %ebx
+; X64-NEXT:    adcq %rdx, %rbx
+; X64-NEXT:    movq 48(%rdi), %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r13, %rbx
-; X64-NEXT:    addq %rax, %rbx
-; X64-NEXT:    movq %r9, %rsi
-; X64-NEXT:    adcq %rdx, %rsi
-; X64-NEXT:    addq %rbp, %rbx
-; X64-NEXT:    adcq %r11, %rsi
+; X64-NEXT:    movq %r13, %r12
+; X64-NEXT:    addq %rax, %r12
+; X64-NEXT:    movq %r11, %rdi
+; X64-NEXT:    adcq %rdx, %rdi
+; X64-NEXT:    addq %rbp, %r12
+; X64-NEXT:    adcq %rbx, %rdi
 ; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %r13, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rdi, %r8
+; X64-NEXT:    addq %r13, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rcx, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r12, %r8
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbx, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rsi, %r10
+; X64-NEXT:    adcq %rdi, %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; X64-NEXT:    adcq %r9, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    adcq %r11, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, %r10
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq 56(%rax), %r11
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rsi, %rbx
@@ -6918,7 +6908,7 @@
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r11
+; X64-NEXT:    movq %rdi, %r13
 ; X64-NEXT:    addq %rsi, %rax
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
@@ -6930,26 +6920,26 @@
 ; X64-NEXT:    adcq %rdx, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rsi, %rbx
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r10
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rcx, %r11
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbp, %rcx
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rcx, %rsi
@@ -6957,49 +6947,47 @@
 ; X64-NEXT:    adcq %rax, %r13
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    addq %r9, %rsi
+; X64-NEXT:    addq %r14, %rsi
 ; X64-NEXT:    adcq %r8, %r13
 ; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %r10, %rbx
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %r11, %rbx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r9
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdi, %r11
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq 24(%rax), %rcx
+; X64-NEXT:    movq 24(%rax), %r9
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %rbx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rbp, %r8
 ; X64-NEXT:    adcq %rdi, %rcx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    movzbl %bl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    addq %r14, %rbp
-; X64-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    adcq %r9, %rbx
+; X64-NEXT:    movq %r10, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    addq %r11, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    adcq %r10, %rbx
 ; X64-NEXT:    addq %rax, %rbp
 ; X64-NEXT:    adcq %rdx, %rbx
-; X64-NEXT:    addq %rsi, %r10
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rsi, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r13, %r8
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rbp
@@ -7009,76 +6997,74 @@
 ; X64-NEXT:    setb %r15b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %r11, %rsi
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r11, %rdi
+; X64-NEXT:    addq %r14, %rdi
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdi, %r11
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %r8, %r12
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %r9, %r12
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    addq %r14, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    adcq %r9, %r14
+; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    adcq %r10, %r9
 ; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    adcq %rdx, %r14
+; X64-NEXT:    adcq %rdx, %r9
 ; X64-NEXT:    addq %rbp, %r13
-; X64-NEXT:    adcq %rbx, %r11
+; X64-NEXT:    adcq %rbx, %r14
 ; X64-NEXT:    movzbl %r15b, %eax
 ; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq 24(%rax), %rcx
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r11
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rbx, %rbp
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %rbp, %r15
 ; X64-NEXT:    adcq %rsi, %rbx
 ; X64-NEXT:    setb %sil
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
@@ -7090,19 +7076,19 @@
 ; X64-NEXT:    adcq %rdx, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rdi, %rbx
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -7110,7 +7096,7 @@
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    movq %rsi, %rbp
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rdi, %rbx
@@ -7122,11 +7108,11 @@
 ; X64-NEXT:    adcq %r15, %rsi
 ; X64-NEXT:    adcq $0, %r8
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    movq %rbp, %r14
 ; X64-NEXT:    mulq %rdi
@@ -7135,11 +7121,11 @@
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    adcq %rdi, %rcx
 ; X64-NEXT:    setb %dil
 ; X64-NEXT:    movq %r14, %rax
@@ -7147,7 +7133,7 @@
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %dil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rdi # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
 ; X64-NEXT:    addq %r13, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
@@ -7155,65 +7141,63 @@
 ; X64-NEXT:    adcq %r14, %rbp
 ; X64-NEXT:    addq %rax, %rdi
 ; X64-NEXT:    adcq %rdx, %rbp
-; X64-NEXT:    addq %rbx, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rsi, %r11
+; X64-NEXT:    addq %rbx, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rsi, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    addq %r8, %rdi
 ; X64-NEXT:    adcq %r10, %rbp
-; X64-NEXT:    setb %r9b
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r10, %rbx
+; X64-NEXT:    addq %r8, %rbx
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rbx, %r15
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rbx, %r8
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %bl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rcx
-; X64-NEXT:    addq %r13, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rsi
-; X64-NEXT:    movq %rbx, %r12
-; X64-NEXT:    adcq %r14, %rsi
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq %r9, %r15
+; X64-NEXT:    addq %r13, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %r13
+; X64-NEXT:    adcq %r14, %r13
+; X64-NEXT:    addq %rax, %r15
+; X64-NEXT:    adcq %rdx, %r13
 ; X64-NEXT:    addq %rdi, %r11
-; X64-NEXT:    adcq %rbp, %r15
-; X64-NEXT:    movzbl %r9b, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    adcq %rbp, %r8
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    adcq %rax, %r15
+; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -7224,104 +7208,106 @@
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r8, %rbp
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r11
+; X64-NEXT:    movq %rcx, %rbp
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %rsi, %rcx
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    adcq %rbx, %rsi
 ; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    addq %rsi, %rax
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq %r10, %r9
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r12, %r10
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
 ; X64-NEXT:    addq %rax, %r9
 ; X64-NEXT:    adcq %rdx, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    addq %r12, %rbx
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rcx, %r12
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdi, %rbp
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %r15
+; X64-NEXT:    adcq %rax, %rdi
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    addq %r14, %rbx
-; X64-NEXT:    adcq %r8, %r15
+; X64-NEXT:    adcq %r8, %rdi
 ; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %rbp, %rsi
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %r12, %r11
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rbp, %r8
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %r14, %rcx
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq 56(%rax), %rdi
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq 56(%rax), %rsi
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rsi, %r11
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rcx, %r14
 ; X64-NEXT:    adcq %rbp, %rsi
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    addq %rsi, %rax
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    addq %r8, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    adcq %r13, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    adcq %r11, %rsi
 ; X64-NEXT:    addq %rax, %rcx
 ; X64-NEXT:    adcq %rdx, %rsi
 ; X64-NEXT:    addq %rbx, %r12
-; X64-NEXT:    adcq %r15, %r14
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rdi, %r14
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    addq %r9, %rcx
@@ -7336,69 +7322,65 @@
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r9, %rbx
-; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    adcq $0, %r12
 ; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    movq %r8, %rdi
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rbx, %r8
-; X64-NEXT:    adcq %r15, %r9
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %rbx, %rbp
+; X64-NEXT:    adcq %r12, %r9
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    addq %r9, %rax
 ; X64-NEXT:    movzbl %bl, %edi
 ; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    addq %r11, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    adcq %r13, %rbp
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    adcq %rdx, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    addq %r8, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    adcq %r11, %r10
+; X64-NEXT:    addq %rax, %r12
+; X64-NEXT:    adcq %rdx, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    adcq %rsi, %r8
+; X64-NEXT:    adcq %rsi, %rbp
 ; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    adcq %rax, %r12
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    adcq %r13, %r14
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
 ; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r11, %rbx
@@ -7407,176 +7389,171 @@
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rbx, %r12
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rbx, %r9
 ; X64-NEXT:    adcq %rdi, %rcx
 ; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %bl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
 ; X64-NEXT:    addq %rax, %r8
-; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    adcq %rdx, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r11, %rbx
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r13
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rdi, %rcx
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %rcx, %r13
 ; X64-NEXT:    movzbl %bl, %eax
 ; X64-NEXT:    adcq %rax, %r11
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    adcq %r12, %r11
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r9, %r11
 ; X64-NEXT:    adcq $0, %r8
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    movq %rsi, %r15
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r10
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %r8, %rcx
+; X64-NEXT:    addq %rbx, %rcx
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    adcq %rsi, %rbx
 ; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    addq %r13, %rsi
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    adcq %r14, %rcx
-; X64-NEXT:    addq %rax, %rsi
-; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    addq %rdi, %r12
-; X64-NEXT:    adcq %r11, %r8
-; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
+; X64-NEXT:    addq %rax, %r14
+; X64-NEXT:    adcq %rdx, %rcx
+; X64-NEXT:    addq %r13, %r9
+; X64-NEXT:    movq %r9, %r13
+; X64-NEXT:    adcq %r11, %r8
+; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %rcx, %rdi
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rdi, %r10
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rdi, %r9
 ; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    setb %bl
+; X64-NEXT:    setb %sil
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %bl, %ecx
+; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    addq %r13, %rsi
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    adcq %r14, %rcx
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    addq %rax, %rsi
 ; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    adcq (%rsp), %r10 # 8-byte Folded Reload
+; X64-NEXT:    addq %r14, %r11
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rax, %rsi
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    adcq %r15, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbp, %r11
-; X64-NEXT:    movq %r11, (%rsp) # 8-byte Spill
+; X64-NEXT:    adcq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    adcq %r12, %r13
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r10, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rax, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq 64(%r9), %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 64(%rcx), %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rsi, %rbx
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq 72(%rcx), %rsi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq 72(%r9), %rsi
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movq %rdx, %rsi
@@ -7584,9 +7561,9 @@
 ; X64-NEXT:    addq %rbx, %r8
 ; X64-NEXT:    adcq %rbp, %rsi
 ; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    movq %rcx, %r13
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rdi
@@ -7598,141 +7575,138 @@
 ; X64-NEXT:    mulq %rdx
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    addq %rax, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    addq %rax, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    adcq %rdx, %r15
-; X64-NEXT:    addq %rdi, %r12
+; X64-NEXT:    addq %rdi, %r10
 ; X64-NEXT:    adcq %rcx, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    movq %r11, %rsi
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %r11, %rdi
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rdi, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rbp, %r11
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    adcq %r13, %r14
 ; X64-NEXT:    addq %rax, %rbx
 ; X64-NEXT:    adcq %rdx, %r14
-; X64-NEXT:    addq %r13, %rbx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq %r8, %r14
-; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq 80(%rbp), %rdi
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq 80(%r9), %rdi
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %r8, %rcx
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq 88(%rbp), %r10
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq 88(%r9), %r9
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    adcq %rsi, %rbp
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    setb %r12b
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rbp, %rsi
-; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    movzbl %r12b, %eax
 ; X64-NEXT:    adcq %rax, %rcx
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    mulq %rdx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    addq %rax, %rbp
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    adcq %rdx, %rax
-; X64-NEXT:    addq %rsi, %rbp
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    addq %rbx, %r13
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    adcq %rdx, %r13
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    adcq %rcx, %r13
+; X64-NEXT:    addq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq %r14, %r8
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    adcq $0, %rax
-; X64-NEXT:    addq %r12, %rbp
-; X64-NEXT:    movq %rbp, %r8
-; X64-NEXT:    adcq %r15, %rax
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    adcq %r15, %r13
 ; X64-NEXT:    setb %r14b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r15, %rbx
+; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %sil, %ecx
+; X64-NEXT:    movzbl %bl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    addq %r9, %rsi
+; X64-NEXT:    addq %r11, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq %r12, %rcx
 ; X64-NEXT:    addq %rax, %rsi
 ; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    addq %r8, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r11, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r8, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r13, %rbp
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl %r14b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %r10
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    imulq %rax, %r9
+; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r10, %rdx
+; X64-NEXT:    addq %r9, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    imulq %rbp, %rdi
 ; X64-NEXT:    addq %rdx, %rdi
@@ -7752,11 +7726,11 @@
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
@@ -7777,12 +7751,11 @@
 ; X64-NEXT:    adcq %rax, %r12
 ; X64-NEXT:    addq %r9, %r13
 ; X64-NEXT:    adcq %r8, %r12
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq 120(%rdx), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    movq 120(%rbp), %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    imulq %r10, %rcx
-; X64-NEXT:    movq 112(%rdx), %rsi
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq 112(%rbp), %rsi
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rax, %r11
@@ -7840,46 +7813,45 @@
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq 80(%rsi), %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq 80(%r9), %rsi
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq 88(%rsi), %rax
-; X64-NEXT:    movq %rsi, %r9
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq 88(%r9), %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rcx, %r11
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r8, %rbx
+; X64-NEXT:    addq %rdi, %rbx
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rbx, %r14
 ; X64-NEXT:    adcq %rbp, %rcx
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %r12 # 8-byte Reload
 ; X64-NEXT:    addq %r12, %rsi
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
@@ -7891,8 +7863,8 @@
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 72(%r9), %r9
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq 72(%r9), %rdi
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rbx
@@ -7905,8 +7877,7 @@
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbp, %rcx
 ; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rbp
@@ -7924,20 +7895,20 @@
 ; X64-NEXT:    addq %rbp, %rcx
 ; X64-NEXT:    adcq %rbx, %r8
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
 ; X64-NEXT:    adcq %r14, %r8
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
@@ -7950,74 +7921,73 @@
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    adcq %rdi, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %dil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    addq %r14, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    adcq %r13, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    addq %r9, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    adcq %r8, %r11
 ; X64-NEXT:    addq %rax, %r15
 ; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq (%rsp), %r12 # 8-byte Folded Reload
+; X64-NEXT:    movq %r12, (%rsp) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    addq %rsi, %r15
 ; X64-NEXT:    adcq %r10, %r11
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %r8, %r12
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    addq %r10, %rbx
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %rdi, %rcx
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %r8b, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    addq %r14, %rsi
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    adcq %r13, %rcx
-; X64-NEXT:    addq %rax, %rsi
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    movzbl %bl, %ecx
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    addq %r9, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq %r8, %rcx
+; X64-NEXT:    addq %rax, %rbx
 ; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    addq %r15, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r11, %rbx
+; X64-NEXT:    addq %r15, %rbp
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r11, %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    adcq %rax, %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq 96(%rbp), %rcx
 ; X64-NEXT:    imulq %rcx, %rdi
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %r12, %rsi
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rdi, %rdx
 ; X64-NEXT:    movq 104(%rbp), %r8
@@ -8067,32 +8037,31 @@
 ; X64-NEXT:    addq %r10, %rbp
 ; X64-NEXT:    adcq %rdi, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rsi
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    imulq %r13, %rsi
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rsi, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    imulq %r11, %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    imulq %r11, %r8
+; X64-NEXT:    addq %rdx, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    imulq %r15, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %r14, %rax
-; X64-NEXT:    addq %rdx, %rax
-; X64-NEXT:    addq %r8, %r10
-; X64-NEXT:    adcq %r9, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    imulq %rdi, %rax
+; X64-NEXT:    addq %rdx, %rax
+; X64-NEXT:    addq %r9, %r10
+; X64-NEXT:    adcq %r8, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %r14
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r8
@@ -8128,7 +8097,7 @@
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rbp # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
@@ -8141,7 +8110,7 @@
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdi, %r10
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT:    adcq (%rsp), %rbx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 7be8a8e..df64b90 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -25,15 +25,15 @@
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 12(%ecx), %ebp
 ; X32-NEXT:    movl 8(%ecx), %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl (%eax), %ebx
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
@@ -44,60 +44,60 @@
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, %edi
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ebx
 ; X32-NEXT:    xorl %edx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    adcl %ebp, %edx
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl (%esi), %ebp
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl 4(%esi), %esi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %esi, %eax
@@ -107,84 +107,84 @@
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    adcl %edi, %eax
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl 8(%eax), %ebx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 8(%edi), %ebx
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, %edi
+; X32-NEXT:    movl %esi, %ecx
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl 12(%ecx), %ecx
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%edi), %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ecx
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebp, %edi
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %edi, %ebp
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
 ; X32-NEXT:    movl %ebx, %edi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    addl %eax, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl %esi, %eax
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %eax
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
@@ -192,80 +192,80 @@
 ; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 16(%ecx), %esi
 ; X32-NEXT:    imull %esi, %ebx
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ebx, %edx
 ; X32-NEXT:    movl 20(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 24(%ecx), %eax
 ; X32-NEXT:    movl %ecx, %ebp
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    imull %ecx, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    addl %edi, %edx
 ; X32-NEXT:    movl 28(%ebp), %ebp
 ; X32-NEXT:    imull %ebx, %ebp
 ; X32-NEXT:    addl %edx, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    addl %edx, (%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %esi
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
@@ -273,37 +273,37 @@
 ; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %ebp, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT:    movl 28(%ebx), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    imull %esi, %ecx
 ; X32-NEXT:    movl 24(%ebx), %edi
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    movl 16(%ebx), %ebp
 ; X32-NEXT:    movl 20(%ebx), %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    imull %ebp, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -311,38 +311,38 @@
 ; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    adcl %ecx, %edi
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, %ebx
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, (%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 4(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 8(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 12(%ecx)
 ; X32-NEXT:    movl %ebx, 16(%ecx)
 ; X32-NEXT:    movl %esi, 20(%ecx)
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index d834548..a0ed135 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -12,9 +12,9 @@
 ; X32-NEXT:    subl $244, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 20(%ecx), %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 16(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, %ebp
 ; X32-NEXT:    xorl %ebx, %ebx
 ; X32-NEXT:    mull %ebx
@@ -27,37 +27,37 @@
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %ebx
 ; X32-NEXT:    movl %ecx, %edi
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb %cl
 ; X32-NEXT:    addl %eax, %ebx
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %edx, %ecx
 ; X32-NEXT:    movl 24(%ebp), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    adcl %edx, %edi
 ; X32-NEXT:    addl %ebx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl (%ecx), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %ebx, %ebx
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 4(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, %esi
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %ebx
@@ -65,73 +65,73 @@
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    movzbl %bl, %ebx
 ; X32-NEXT:    adcl %edx, %ebx
 ; X32-NEXT:    movl 8(%esi), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %esi
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %ebp, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl (%ecx), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %ebp, %ebp
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    adcl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 16(%eax), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, %edi
 ; X32-NEXT:    movl %ecx, %ebp
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    adcl %edx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 4(%eax), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %edi, %edi
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %esi
@@ -139,107 +139,107 @@
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    movl %ebx, %esi
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    movzbl %bl, %ebx
 ; X32-NEXT:    adcl %edx, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 8(%eax), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %edi
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl 20(%esi), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    movzbl %bl, %ebx
 ; X32-NEXT:    adcl %edx, %ebx
 ; X32-NEXT:    movl 24(%esi), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %esi
 ; X32-NEXT:    addl %eax, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl %edx, %edi
 ; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    adcl %ebx, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT:    adcl %ebp, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    addl %edx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    addl %edx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %esi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %esi, %eax
@@ -247,54 +247,54 @@
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -303,154 +303,154 @@
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 12(%eax), %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ebp
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    addl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    adcl %edx, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    addl %ebp, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, %esi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edi
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 12(%eax), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebx, %edi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %edi
@@ -458,93 +458,93 @@
 ; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ebx
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    addl %ebp, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ebp
 ; X32-NEXT:    adcl %edx, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    addl %ebx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, %esi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ebp
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
@@ -552,11 +552,11 @@
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl %ebp, %eax
@@ -564,21 +564,21 @@
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -586,137 +586,137 @@
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ebp
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %ebp
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    addl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl %bl, %esi
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    adcl %edx, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    addl %ebp, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, %esi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edi
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
@@ -724,11 +724,11 @@
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
@@ -736,21 +736,21 @@
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
@@ -758,33 +758,33 @@
 ; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl %ebp, %edi
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
@@ -792,101 +792,101 @@
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    addl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 32(%ecx), %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
@@ -898,10 +898,10 @@
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %esi, %ecx
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
@@ -911,27 +911,27 @@
 ; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    addl %eax, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl %edx, %esi
 ; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl %ebx, %esi
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -939,102 +939,101 @@
 ; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, %esi
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 40(%eax), %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl 44(%ebx), %ebx
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 44(%eax), %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %edi, %esi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ecx
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    mull %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    addl %eax, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    adcl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %eax
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
@@ -1042,104 +1041,103 @@
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    imull %eax, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ebp, %edx
-; X32-NEXT:    imull {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    imull %edi, %esi
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, %edi
 ; X32-NEXT:    adcl %ebp, %esi
-; X32-NEXT:    movl %esi, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl 60(%edx), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 60(%edi), %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl 56(%edx), %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl 56(%edi), %esi
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    movl 48(%edi), %ebx
 ; X32-NEXT:    movl 52(%edi), %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    imull %ebp, %edi
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %edi, %edx
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
@@ -1147,98 +1145,98 @@
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl %esi, %ecx
 ; X32-NEXT:    movl 40(%esi), %ebx
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl 44(%ecx), %ecx
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ebx, %edi
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl 32(%esi), %edi
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 36(%esi), %esi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %esi, %eax
@@ -1248,30 +1246,30 @@
 ; X32-NEXT:    addl %edi, %esi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    xorl %ecx, %ecx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl %eax, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %edx, %eax
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl %ebp, %eax
-; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -1279,48 +1277,48 @@
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, %ebx
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %esi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, (%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
@@ -1328,193 +1326,193 @@
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    adcl %edx, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 48(%ecx), %ebp
 ; X32-NEXT:    imull %ebp, %ebx
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ebx, %edx
 ; X32-NEXT:    movl 52(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 56(%ecx), %eax
 ; X32-NEXT:    movl %ecx, %ebx
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    imull %esi, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %edi, %edx
 ; X32-NEXT:    movl 60(%ebx), %ebx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    imull %ecx, %ebx
 ; X32-NEXT:    addl %edx, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %esi
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %edx
-; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    imull %ebp, %edi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %edi, %edx
-; X32-NEXT:    imull {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    imull %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %ebx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, (%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 4(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 8(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 12(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 16(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 20(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 24(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 28(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 32(%ecx)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 36(%ecx)
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, 40(%ecx)
 ; X32-NEXT:    movl %esi, 44(%ecx)
 ; X32-NEXT:    movl %edx, 48(%ecx)
@@ -1540,7 +1538,7 @@
 ; X64-NEXT:    movq %rdx, (%rsp) # 8-byte Spill
 ; X64-NEXT:    movq 24(%rdi), %r11
 ; X64-NEXT:    movq 16(%rdi), %r15
-; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rsi), %rdx
 ; X64-NEXT:    movq 8(%rsi), %rbp
 ; X64-NEXT:    movq %r15, %rax
@@ -1549,7 +1547,7 @@
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rsi, %r10
 ; X64-NEXT:    movq %rdx, %rbx
@@ -1557,7 +1555,7 @@
 ; X64-NEXT:    addq %r9, %rsi
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r9
@@ -1568,37 +1566,37 @@
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rbp, %r14
-; X64-NEXT:    movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    adcq %rbx, %rsi
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    movq %r10, %rbx
-; X64-NEXT:    movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    addq %r10, %r15
 ; X64-NEXT:    adcq %r13, %rdx
 ; X64-NEXT:    addq %rbp, %r15
 ; X64-NEXT:    adcq %rsi, %rdx
 ; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq 8(%rdi), %rdi
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %rsi
@@ -1608,7 +1606,7 @@
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbp, %rbx
 ; X64-NEXT:    setb %r11b
 ; X64-NEXT:    movq %rdi, %rax
@@ -1631,16 +1629,16 @@
 ; X64-NEXT:    adcq %r9, %r13
 ; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq 16(%rsi), %r8
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    movq %rcx, %r9
-; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rbp
@@ -1652,7 +1650,7 @@
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbp, %rsi
 ; X64-NEXT:    setb %bpl
 ; X64-NEXT:    movq %rcx, %rax
@@ -1665,31 +1663,31 @@
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rax, %r11
 ; X64-NEXT:    adcq %rdx, %r14
 ; X64-NEXT:    addq %r9, %r11
 ; X64-NEXT:    adcq %rbx, %r14
 ; X64-NEXT:    addq %r10, %r12
-; X64-NEXT:    movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT:    adcq %r13, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    adcq $0, %r14
 ; X64-NEXT:    addq %r15, %r11
-; X64-NEXT:    adcq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r15, %rbx
+; X64-NEXT:    addq %r10, %rbx
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rdi
@@ -1698,27 +1696,27 @@
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    addq %rbp, %rsi
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; X64-NEXT:    adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    addq %rax, %rsi
 ; X64-NEXT:    adcq %rdx, %rcx
 ; X64-NEXT:    addq %r11, %r12
-; X64-NEXT:    movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl %r9b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq 32(%rcx), %rsi
 ; X64-NEXT:    imulq %rsi, %rdi
 ; X64-NEXT:    movq %rsi, %rax
@@ -1731,9 +1729,9 @@
 ; X64-NEXT:    movq 48(%rcx), %rax
 ; X64-NEXT:    movq %rcx, %rbx
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    imulq %rcx, %rdi
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rdi, %rdx
@@ -1746,7 +1744,7 @@
 ; X64-NEXT:    movq %rbp, %r10
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    movq %rcx, %r8
 ; X64-NEXT:    mulq %rsi
@@ -1770,33 +1768,32 @@
 ; X64-NEXT:    adcq %rax, %r11
 ; X64-NEXT:    addq %r14, %r9
 ; X64-NEXT:    adcq %rbx, %r11
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload
-; X64-NEXT:    movq 56(%rdx), %rcx
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    movq 56(%rbp), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    imulq %r10, %rcx
-; X64-NEXT:    movq 48(%rdx), %rbx
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq 48(%rbp), %rbx
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    imulq %r15, %rbx
 ; X64-NEXT:    addq %rdx, %rbx
 ; X64-NEXT:    movq 32(%rbp), %rdi
 ; X64-NEXT:    movq 40(%rbp), %r8
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    imulq %r8, %rcx
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    imulq %rdi, %rax
 ; X64-NEXT:    addq %rdx, %rax
 ; X64-NEXT:    addq %rsi, %r14
 ; X64-NEXT:    adcq %rbx, %rax
-; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r12
@@ -1820,23 +1817,23 @@
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
-; X64-NEXT:    addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq %r13, %rdi
 ; X64-NEXT:    adcq %r9, %rax
 ; X64-NEXT:    adcq %r11, %rdx
-; X64-NEXT:    addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload
-; X64-NEXT:    adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload
-; X64-NEXT:    adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload
-; X64-NEXT:    adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, (%rcx)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, 8(%rcx)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, 16(%rcx)
-; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, 24(%rcx)
 ; X64-NEXT:    movq %rsi, 32(%rcx)
 ; X64-NEXT:    movq %rdi, 40(%rcx)
diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index 2f0e6b2..f0b7aa44 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -6,8 +6,8 @@
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    imulq %rdi, %rcx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rdi, %rcx
 ; X64-NEXT:    mulq %rdx
 ; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    imulq %r8, %rsi
@@ -51,7 +51,7 @@
 ; X86-NEXT:    imull %ebp, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -76,7 +76,7 @@
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/mul64.ll b/llvm/test/CodeGen/X86/mul64.ll
index f8a7aaa..1feed4b 100644
--- a/llvm/test/CodeGen/X86/mul64.ll
+++ b/llvm/test/CodeGen/X86/mul64.ll
@@ -19,8 +19,8 @@
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
-; X64-NEXT:    imulq %rsi, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rsi, %rax
 ; X64-NEXT:    retq
   %k = mul i64 %t, %u
   ret i64 %k
diff --git a/llvm/test/CodeGen/X86/mwaitx-schedule.ll b/llvm/test/CodeGen/X86/mwaitx-schedule.ll
index b0a64ae..ea135fd 100644
--- a/llvm/test/CodeGen/X86/mwaitx-schedule.ll
+++ b/llvm/test/CodeGen/X86/mwaitx-schedule.ll
@@ -6,22 +6,22 @@
 define void @foo(i8* %P, i32 %E, i32 %H) nounwind {
 ; GENERIC-LABEL: foo:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; GENERIC-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; GENERIC-NEXT:    monitorx # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BDVER4-LABEL: foo:
 ; BDVER4:       # %bb.0:
-; BDVER4-NEXT:    leaq (%rdi), %rax
 ; BDVER4-NEXT:    movl %esi, %ecx
+; BDVER4-NEXT:    leaq (%rdi), %rax
 ; BDVER4-NEXT:    monitorx
 ; BDVER4-NEXT:    retq
 ;
 ; ZNVER1-LABEL: foo:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
 ; ZNVER1-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
 ; ZNVER1-NEXT:    monitorx # sched: [100:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   tail call void @llvm.x86.monitorx(i8* %P, i32 %E, i32 %H)
@@ -33,9 +33,9 @@
 ; GENERIC-LABEL: bar:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    pushq %rbx # sched: [5:1.00]
-; GENERIC-NEXT:    movl %edi, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    movl %esi, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    movl %edx, %ebx # sched: [1:0.33]
+; GENERIC-NEXT:    movl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; GENERIC-NEXT:    mwaitx # sched: [100:0.33]
 ; GENERIC-NEXT:    popq %rbx # sched: [6:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -43,9 +43,9 @@
 ; BDVER4-LABEL: bar:
 ; BDVER4:       # %bb.0:
 ; BDVER4-NEXT:    pushq %rbx
-; BDVER4-NEXT:    movl %edi, %ecx
-; BDVER4-NEXT:    movl %esi, %eax
 ; BDVER4-NEXT:    movl %edx, %ebx
+; BDVER4-NEXT:    movl %esi, %eax
+; BDVER4-NEXT:    movl %edi, %ecx
 ; BDVER4-NEXT:    mwaitx
 ; BDVER4-NEXT:    popq %rbx
 ; BDVER4-NEXT:    retq
@@ -53,9 +53,9 @@
 ; ZNVER1-LABEL: bar:
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    pushq %rbx # sched: [1:0.50]
-; ZNVER1-NEXT:    movl %edi, %ecx # sched: [1:0.25]
-; ZNVER1-NEXT:    movl %esi, %eax # sched: [1:0.25]
 ; ZNVER1-NEXT:    movl %edx, %ebx # sched: [1:0.25]
+; ZNVER1-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; ZNVER1-NEXT:    mwaitx # sched: [100:0.25]
 ; ZNVER1-NEXT:    popq %rbx # sched: [8:0.50]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
diff --git a/llvm/test/CodeGen/X86/mwaitx.ll b/llvm/test/CodeGen/X86/mwaitx.ll
index 5bf6431..24d5093 100644
--- a/llvm/test/CodeGen/X86/mwaitx.ll
+++ b/llvm/test/CodeGen/X86/mwaitx.ll
@@ -4,8 +4,9 @@
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=bdver4 | FileCheck %s -check-prefix=WIN64
 
 ; CHECK-LABEL: foo:
-; CHECK: leaq    (%rdi), %rax
-; CHECK-NEXT: movl    %esi, %ecx
+; CHECK-LABEL: # %bb.0:
+; CHECK-DAG: leaq    (%rdi), %rax
+; CHECK-DAG: movl    %esi, %ecx
 ; CHECK-NEXT: monitorx
 ; WIN64-LABEL: foo:
 ; WIN64:      leaq    (%rcx), %rax
@@ -21,13 +22,15 @@
 declare void @llvm.x86.monitorx(i8*, i32, i32) nounwind
 
 ; CHECK-LABEL: bar:
-; CHECK: movl    %edi, %ecx
-; CHECK-NEXT: movl    %esi, %eax
-; CHECK-NEXT: movl    %edx, %ebx
+; CHECK: pushq
+; CHECK-DAG: movl    %edi, %ecx
+; CHECK-DAG: movl    %esi, %eax
+; CHECK-DAG: movl    %edx, %ebx
 ; CHECK-NEXT: mwaitx
 ; WIN64-LABEL: bar:
-; WIN64:      movl    %edx, %eax
-; WIN64:      movl    %r8d, %ebx
+; WIN64: pushq
+; WIN64-DAG:      movl    %edx, %eax
+; WIN64-DAG:      movl    %r8d, %ebx
 ; WIN64-NEXT: mwaitx
 define void @bar(i32 %E, i32 %H, i32 %C) nounwind {
 entry:
diff --git a/llvm/test/CodeGen/X86/negate-i1.ll b/llvm/test/CodeGen/X86/negate-i1.ll
index 743f1a1..2ed6c95 100644
--- a/llvm/test/CodeGen/X86/negate-i1.ll
+++ b/llvm/test/CodeGen/X86/negate-i1.ll
@@ -5,9 +5,10 @@
 define i8 @select_i8_neg1_or_0(i1 %a) {
 ; X64-LABEL: select_i8_neg1_or_0:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    negb %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i8_neg1_or_0:
@@ -23,8 +24,9 @@
 define i8 @select_i8_neg1_or_0_zeroext(i1 zeroext %a) {
 ; X64-LABEL: select_i8_neg1_or_0_zeroext:
 ; X64:       # %bb.0:
-; X64-NEXT:    negb %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    negb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i8_neg1_or_0_zeroext:
@@ -39,9 +41,10 @@
 define i16 @select_i16_neg1_or_0(i1 %a) {
 ; X64-LABEL: select_i16_neg1_or_0:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    negl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i16_neg1_or_0:
@@ -58,8 +61,9 @@
 define i16 @select_i16_neg1_or_0_zeroext(i1 zeroext %a) {
 ; X64-LABEL: select_i16_neg1_or_0_zeroext:
 ; X64:       # %bb.0:
-; X64-NEXT:    negl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i16_neg1_or_0_zeroext:
@@ -75,9 +79,9 @@
 define i32 @select_i32_neg1_or_0(i1 %a) {
 ; X64-LABEL: select_i32_neg1_or_0:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    negl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i32_neg1_or_0:
@@ -93,8 +97,8 @@
 define i32 @select_i32_neg1_or_0_zeroext(i1 zeroext %a) {
 ; X64-LABEL: select_i32_neg1_or_0_zeroext:
 ; X64:       # %bb.0:
-; X64-NEXT:    negl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    negl %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i32_neg1_or_0_zeroext:
@@ -109,10 +113,9 @@
 define i64 @select_i64_neg1_or_0(i1 %a) {
 ; X64-LABEL: select_i64_neg1_or_0:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: select_i64_neg1_or_0:
diff --git a/llvm/test/CodeGen/X86/negate-shift.ll b/llvm/test/CodeGen/X86/negate-shift.ll
index 8804460..25cb1e7 100644
--- a/llvm/test/CodeGen/X86/negate-shift.ll
+++ b/llvm/test/CodeGen/X86/negate-shift.ll
@@ -4,8 +4,8 @@
 define i32 @neg_lshr_signbit(i32 %x) {
 ; X64-LABEL: neg_lshr_signbit:
 ; X64:       # %bb.0:
-; X64-NEXT:    sarl $31, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    sarl $31, %eax
 ; X64-NEXT:    retq
   %sh = lshr i32 %x, 31
   %neg = sub i32 0, %sh
@@ -15,8 +15,8 @@
 define i64 @neg_ashr_signbit(i64 %x) {
 ; X64-LABEL: neg_ashr_signbit:
 ; X64:       # %bb.0:
-; X64-NEXT:    shrq $63, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq $63, %rax
 ; X64-NEXT:    retq
   %sh = ashr i64 %x, 63
   %neg = sub i64 0, %sh
diff --git a/llvm/test/CodeGen/X86/negate.ll b/llvm/test/CodeGen/X86/negate.ll
index 62e4dff..4026ed3 100644
--- a/llvm/test/CodeGen/X86/negate.ll
+++ b/llvm/test/CodeGen/X86/negate.ll
@@ -42,8 +42,9 @@
 define i8 @negate_zero_or_minsigned(i8 %x) {
 ; CHECK-LABEL: negate_zero_or_minsigned:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shlb $7, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $7, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %signbit = shl i8 %x, 7
   %neg = sub i8 0, %signbit
diff --git a/llvm/test/CodeGen/X86/no-sse2-avg.ll b/llvm/test/CodeGen/X86/no-sse2-avg.ll
index 0472cc2..21528f7 100644
--- a/llvm/test/CodeGen/X86/no-sse2-avg.ll
+++ b/llvm/test/CodeGen/X86/no-sse2-avg.ll
@@ -5,9 +5,9 @@
 define <16 x i8> @PR27973() {
 ; CHECK-LABEL: PR27973:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movq $0, 8(%rdi)
 ; CHECK-NEXT:    movq $0, (%rdi)
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    retq
   %t0 = zext <16 x i8> zeroinitializer to <16 x i32>
   %t1 = add nuw nsw <16 x i32> %t0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/CodeGen/X86/not-and-simplify.ll b/llvm/test/CodeGen/X86/not-and-simplify.ll
index e753aeb..8fbe6e7 100644
--- a/llvm/test/CodeGen/X86/not-and-simplify.ll
+++ b/llvm/test/CodeGen/X86/not-and-simplify.ll
@@ -7,9 +7,9 @@
 define i32 @shrink_xor_constant1(i32 %x) {
 ; ALL-LABEL: shrink_xor_constant1:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    shrl $31, %edi
-; ALL-NEXT:    xorl $1, %edi
 ; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    shrl $31, %eax
+; ALL-NEXT:    xorl $1, %eax
 ; ALL-NEXT:    retq
   %sh = lshr i32 %x, 31
   %not = xor i32 %sh, -1
@@ -34,9 +34,10 @@
 define i8 @shrink_xor_constant2(i8 %x) {
 ; ALL-LABEL: shrink_xor_constant2:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    shlb $5, %dil
-; ALL-NEXT:    xorb $-32, %dil
 ; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    shlb $5, %al
+; ALL-NEXT:    xorb $-32, %al
+; ALL-NEXT:    # kill: def $al killed $al killed $eax
 ; ALL-NEXT:    retq
   %sh = shl i8 %x, 5
   %not = xor i8 %sh, -1
diff --git a/llvm/test/CodeGen/X86/palignr.ll b/llvm/test/CodeGen/X86/palignr.ll
index 64bbf21..19d493b 100644
--- a/llvm/test/CodeGen/X86/palignr.ll
+++ b/llvm/test/CodeGen/X86/palignr.ll
@@ -167,16 +167,15 @@
 ; CHECK-SSE2-LABEL: test9:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; CHECK-SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; CHECK-SSE2-NEXT:    por %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; CHECK-SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; CHECK-SSE2-NEXT:    por %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retl
 ;
 ; CHECK-SSSE3-LABEL: test9:
 ; CHECK-SSSE3:       # %bb.0:
-; CHECK-SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
 ; CHECK-SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
 ; CHECK-SSSE3-NEXT:    retl
 ;
 ; CHECK-AVX-LABEL: test9:
diff --git a/llvm/test/CodeGen/X86/peep-setb.ll b/llvm/test/CodeGen/X86/peep-setb.ll
index 3794b37..944aa4d 100644
--- a/llvm/test/CodeGen/X86/peep-setb.ll
+++ b/llvm/test/CodeGen/X86/peep-setb.ll
@@ -7,9 +7,10 @@
 define i8 @test1(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpb %sil, %dil
-; CHECK-NEXT:    adcb $0, %sil
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpb %al, %dil
+; CHECK-NEXT:    adcb $0, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i8 %a, %b
   %cond = zext i1 %cmp to i8
@@ -20,9 +21,9 @@
 define i32 @test2(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl %esi, %edi
-; CHECK-NEXT:    adcl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    adcl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i32 %a, %b
   %cond = zext i1 %cmp to i32
@@ -33,9 +34,9 @@
 define i64 @test3(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    adcq $0, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    adcq $0, %rax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i64 %a, %b
   %conv = zext i1 %cmp to i64
@@ -46,9 +47,10 @@
 define i8 @test4(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpb %sil, %dil
-; CHECK-NEXT:    sbbb $0, %sil
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpb %al, %dil
+; CHECK-NEXT:    sbbb $0, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i8 %a, %b
   %cond = zext i1 %cmp to i8
@@ -59,9 +61,9 @@
 define i32 @test5(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl %esi, %edi
-; CHECK-NEXT:    sbbl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    sbbl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i32 %a, %b
   %cond = zext i1 %cmp to i32
@@ -72,9 +74,9 @@
 define i64 @test6(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    sbbq $0, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    sbbq $0, %rax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i64 %a, %b
   %conv = zext i1 %cmp to i64
@@ -85,9 +87,10 @@
 define i8 @test7(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpb %sil, %dil
-; CHECK-NEXT:    adcb $0, %sil
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpb %al, %dil
+; CHECK-NEXT:    adcb $0, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i8 %a, %b
   %cond = sext i1 %cmp to i8
@@ -98,9 +101,9 @@
 define i32 @test8(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: test8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpl %esi, %edi
-; CHECK-NEXT:    adcl $0, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    adcl $0, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i32 %a, %b
   %cond = sext i1 %cmp to i32
@@ -111,9 +114,9 @@
 define i64 @test9(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    adcq $0, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    adcq $0, %rax
 ; CHECK-NEXT:    retq
   %cmp = icmp ult i64 %a, %b
   %conv = sext i1 %cmp to i64
diff --git a/llvm/test/CodeGen/X86/pku.ll b/llvm/test/CodeGen/X86/pku.ll
index 6031baf..e5b47ec 100644
--- a/llvm/test/CodeGen/X86/pku.ll
+++ b/llvm/test/CodeGen/X86/pku.ll
@@ -16,9 +16,9 @@
 ;
 ; X64-LABEL: test_x86_wrpkru:
 ; X64:       ## %bb.0:
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
 ; X64-NEXT:    xorl %ecx, %ecx ## encoding: [0x31,0xc9]
 ; X64-NEXT:    xorl %edx, %edx ## encoding: [0x31,0xd2]
-; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
 ; X64-NEXT:    wrpkru ## encoding: [0x0f,0x01,0xef]
 ; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.wrpkru(i32 %src)
diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index d44315a..9dac1eb 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -89,6 +89,7 @@
 define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
 ; SSE-LABEL: pmaddubsw_512:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movdqa 112(%rdx), %xmm0
 ; SSE-NEXT:    movdqa 96(%rdx), %xmm1
 ; SSE-NEXT:    movdqa 80(%rdx), %xmm2
@@ -113,7 +114,6 @@
 ; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
 ; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
 ; SSE-NEXT:    movdqa %xmm4, (%rdi)
-; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: pmaddubsw_512:
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 8f61d34..9fe8a66 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -228,6 +228,7 @@
 define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
 ; SSE-LABEL: mulhuw_v64i16:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
@@ -244,7 +245,6 @@
 ; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
 ; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
 ; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: mulhuw_v64i16:
@@ -279,6 +279,7 @@
 define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
 ; SSE-LABEL: mulhw_v64i16:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
@@ -295,7 +296,6 @@
 ; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
 ; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
 ; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: mulhw_v64i16:
diff --git a/llvm/test/CodeGen/X86/pr12360.ll b/llvm/test/CodeGen/X86/pr12360.ll
index 6ffa2fc..3df5b94 100644
--- a/llvm/test/CodeGen/X86/pr12360.ll
+++ b/llvm/test/CodeGen/X86/pr12360.ll
@@ -32,8 +32,9 @@
 define zeroext i1 @f3(i1 %x) {
 ; CHECK-LABEL: f3:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    andb $1, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 
 entry:
diff --git a/llvm/test/CodeGen/X86/pr15705.ll b/llvm/test/CodeGen/X86/pr15705.ll
index d70895b..0c72359 100644
--- a/llvm/test/CodeGen/X86/pr15705.ll
+++ b/llvm/test/CodeGen/X86/pr15705.ll
@@ -22,14 +22,14 @@
 ;
 ; X64-LABEL: PR15705:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    cmpl %esi, %edi
 ; X64-NEXT:    je .LBB0_2
 ; X64-NEXT:  # %bb.1: # %if.end
-; X64-NEXT:    cmpl %edx, %edi
+; X64-NEXT:    cmpl %eax, %edi
 ; X64-NEXT:    cmovel %ecx, %esi
-; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:  .LBB0_2: # %return
-; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    retq
 entry:
   %cmp = icmp eq i32 %x, %a
diff --git a/llvm/test/CodeGen/X86/pr15981.ll b/llvm/test/CodeGen/X86/pr15981.ll
index 90e1cca3..db66de5 100644
--- a/llvm/test/CodeGen/X86/pr15981.ll
+++ b/llvm/test/CodeGen/X86/pr15981.ll
@@ -19,9 +19,9 @@
 ;
 ; X64-LABEL: fn1:
 ; X64:       # %bb.0:
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    cmovel %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    testl %esi, %esi
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %3 = icmp ne i32 %1, 0
   %4 = select i1 %3, i32 %0, i32 0
diff --git a/llvm/test/CodeGen/X86/pr23664.ll b/llvm/test/CodeGen/X86/pr23664.ll
index 155fc03..4b9720f 100644
--- a/llvm/test/CodeGen/X86/pr23664.ll
+++ b/llvm/test/CodeGen/X86/pr23664.ll
@@ -7,8 +7,9 @@
   ret i2 %or
 
 ; CHECK-LABEL: f:
-; CHECK:      addb    %dil, %dil
-; CHECK-NEXT: orb     $1, %dil
-; CHECK-NEXT: movl    %edi, %eax
+; CHECK:      movl    %edi, %eax
+; CHECK-NEXT: addb    %al, %al
+; CHECK-NEXT: orb     $1, %al
+; CHECK-NEXT: # kill
 ; CHECK-NEXT: retq
 }
diff --git a/llvm/test/CodeGen/X86/pr28173.ll b/llvm/test/CodeGen/X86/pr28173.ll
index 4cb2567..a10991e 100644
--- a/llvm/test/CodeGen/X86/pr28173.ll
+++ b/llvm/test/CodeGen/X86/pr28173.ll
@@ -78,8 +78,9 @@
 define i8 @foo8(i1 zeroext %i) #0 {
 ; CHECK-LABEL: foo8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orb $-2, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    orb $-2, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   br label %bb
 
diff --git a/llvm/test/CodeGen/X86/pr34653.ll b/llvm/test/CodeGen/X86/pr34653.ll
index db3c95d..858e0f46 100644
--- a/llvm/test/CodeGen/X86/pr34653.ll
+++ b/llvm/test/CodeGen/X86/pr34653.ll
@@ -33,170 +33,170 @@
 ; CHECK-NEXT:    vmovaps %xmm13, %xmm14
 ; CHECK-NEXT:    vmovaps %xmm10, %xmm15
 ; CHECK-NEXT:    vmovaps %xmm15, %xmm2
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm9, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm9, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm9, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm8, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm8, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm8, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm7, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm7, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm7, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
 ; CHECK-NEXT:    # kill: def $ymm10 killed $ymm10 killed $zmm10
 ; CHECK-NEXT:    vextractf128 $1, %ymm10, %xmm10
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm10, %xmm0
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm9 killed $ymm9 killed $zmm9
 ; CHECK-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm9, %xmm0
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm8 killed $ymm8 killed $zmm8
 ; CHECK-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm8, %xmm0
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm7 killed $ymm7 killed $zmm7
 ; CHECK-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm7, %xmm0
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll
index 58c97f6..6c39547 100644
--- a/llvm/test/CodeGen/X86/pr34657.ll
+++ b/llvm/test/CodeGen/X86/pr34657.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s 
+; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s
 
 define <112 x i8> @pr34657() local_unnamed_addr {
-; CHECK-LABEL: pr34657
+; CHECK-LABEL: pr34657:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    vmovups (%rax), %xmm0
 ; CHECK-NEXT:    vmovups (%rax), %ymm1
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
@@ -11,7 +12,6 @@
 ; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
 ; CHECK-NEXT:    vmovaps %zmm2, (%rdi)
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm0, 96(%rdi)
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/promote-i16.ll b/llvm/test/CodeGen/X86/promote-i16.ll
index 311adc1..a88b173 100644
--- a/llvm/test/CodeGen/X86/promote-i16.ll
+++ b/llvm/test/CodeGen/X86/promote-i16.ll
@@ -12,8 +12,9 @@
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl $21998, %edi # imm = 0x55EE
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl $21998, %eax # imm = 0x55EE
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
   %0 = xor i16 %x, 21998
@@ -30,8 +31,9 @@
 ;
 ; X64-LABEL: bar:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl $54766, %edi # imm = 0xD5EE
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl $54766, %eax # imm = 0xD5EE
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 entry:
   %0 = xor i16 %x, 54766
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 8f2e10f..fe69c60 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -233,16 +233,16 @@
 define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
 ; SSE41-LABEL: vecsel128:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    ptest %xmm0, %xmm0
-; SSE41-NEXT:    cmovel %esi, %edi
 ; SSE41-NEXT:    movl %edi, %eax
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    cmovel %esi, %eax
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vecsel128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vptest %xmm0, %xmm0
-; AVX-NEXT:    cmovel %esi, %edi
 ; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    cmovel %esi, %eax
 ; AVX-NEXT:    retq
   %t0 = bitcast <4 x i32> %input to i128
   %t1 = icmp ne i128 %t0, 0
@@ -253,17 +253,17 @@
 define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
 ; SSE41-LABEL: vecsel256:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movl %edi, %eax
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    ptest %xmm0, %xmm0
-; SSE41-NEXT:    cmovel %esi, %edi
-; SSE41-NEXT:    movl %edi, %eax
+; SSE41-NEXT:    cmovel %esi, %eax
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vecsel256:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vptest %ymm0, %ymm0
-; AVX-NEXT:    cmovel %esi, %edi
 ; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    cmovel %esi, %eax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
   %t0 = bitcast <8 x i32> %input to i256
@@ -275,45 +275,45 @@
 define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
 ; SSE41-LABEL: vecsel512:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movl %edi, %eax
 ; SSE41-NEXT:    por %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    por %xmm0, %xmm1
 ; SSE41-NEXT:    ptest %xmm1, %xmm1
-; SSE41-NEXT:    cmovel %esi, %edi
-; SSE41-NEXT:    movl %edi, %eax
+; SSE41-NEXT:    cmovel %esi, %eax
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vecsel512:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    movl %edi, %eax
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vptest %ymm0, %ymm0
-; AVX1-NEXT:    cmovel %esi, %edi
-; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    cmovel %esi, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: vecsel512:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT:    vmovq %xmm1, %rax
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %rcx
-; AVX512-NEXT:    orq %rax, %rcx
-; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512-NEXT:    vmovq %xmm3, %rax
-; AVX512-NEXT:    orq %rcx, %rax
-; AVX512-NEXT:    vmovq %xmm0, %rcx
-; AVX512-NEXT:    orq %rax, %rcx
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT:    orq %rax, %rdx
-; AVX512-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512-NEXT:    orq %rdx, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512-NEXT:    orq %rax, %rdx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    cmovel %esi, %edi
 ; AVX512-NEXT:    movl %edi, %eax
+; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rcx
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rdx
+; AVX512-NEXT:    orq %rcx, %rdx
+; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; AVX512-NEXT:    vmovq %xmm3, %rcx
+; AVX512-NEXT:    orq %rdx, %rcx
+; AVX512-NEXT:    vmovq %xmm0, %rdx
+; AVX512-NEXT:    orq %rcx, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdi
+; AVX512-NEXT:    orq %rcx, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX512-NEXT:    orq %rdi, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512-NEXT:    orq %rcx, %rdi
+; AVX512-NEXT:    orq %rdx, %rdi
+; AVX512-NEXT:    cmovel %esi, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %t0 = bitcast <16 x i32> %input to i512
diff --git a/llvm/test/CodeGen/X86/rot16.ll b/llvm/test/CodeGen/X86/rot16.ll
index 481163e..3b2a01b 100644
--- a/llvm/test/CodeGen/X86/rot16.ll
+++ b/llvm/test/CodeGen/X86/rot16.ll
@@ -13,8 +13,10 @@
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldw %cl, %di, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldw %cl, %ax, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = shl i16 %x, %z
 	%t1 = sub i16 16, %z
@@ -35,8 +37,10 @@
 ; X64-LABEL: bar:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldw %cl, %di, %si
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldw %cl, %di, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = shl i16 %y, %z
 	%t1 = sub i16 16, %z
@@ -56,8 +60,10 @@
 ; X64-LABEL: un:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdw %cl, %di, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdw %cl, %ax, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = lshr i16 %x, %z
 	%t1 = sub i16 16, %z
@@ -78,8 +84,10 @@
 ; X64-LABEL: bu:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdw %cl, %di, %si
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdw %cl, %di, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = lshr i16 %y, %z
 	%t1 = sub i16 16, %z
@@ -97,8 +105,9 @@
 ;
 ; X64-LABEL: xfoo:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw $5, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolw $5, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = lshr i16 %x, 11
 	%t1 = shl i16 %x, 5
@@ -116,8 +125,9 @@
 ;
 ; X64-LABEL: xbar:
 ; X64:       # %bb.0:
-; X64-NEXT:    shldw $5, %di, %si
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    shldw $5, %di, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = shl i16 %y, 5
 	%t1 = lshr i16 %x, 11
@@ -134,8 +144,9 @@
 ;
 ; X64-LABEL: xun:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw $11, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolw $11, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = lshr i16 %x, 5
 	%t1 = shl i16 %x, 11
@@ -153,8 +164,9 @@
 ;
 ; X64-LABEL: xbu:
 ; X64:       # %bb.0:
-; X64-NEXT:    shldw $11, %si, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shldw $11, %si, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%t0 = lshr i16 %y, 5
 	%t1 = shl i16 %x, 11
diff --git a/llvm/test/CodeGen/X86/rot64.ll b/llvm/test/CodeGen/X86/rot64.ll
index e8f090c..94b9eae 100644
--- a/llvm/test/CodeGen/X86/rot64.ll
+++ b/llvm/test/CodeGen/X86/rot64.ll
@@ -6,9 +6,10 @@
 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; ALL-LABEL: foo:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movl %edx, %ecx
-; ALL-NEXT:    rolq %cl, %rdi
+; ALL-NEXT:    movq %rdx, %rcx
 ; ALL-NEXT:    movq %rdi, %rax
+; ALL-NEXT:    # kill: def $cl killed $cl killed $rcx
+; ALL-NEXT:    rolq %cl, %rax
 ; ALL-NEXT:    retq
 entry:
 	%0 = shl i64 %x, %z
@@ -21,9 +22,10 @@
 define i64 @bar(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; ALL-LABEL: bar:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movl %edx, %ecx
-; ALL-NEXT:    shldq %cl, %rdi, %rsi
+; ALL-NEXT:    movq %rdx, %rcx
 ; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    # kill: def $cl killed $cl killed $rcx
+; ALL-NEXT:    shldq %cl, %rdi, %rax
 ; ALL-NEXT:    retq
 entry:
 	%0 = shl i64 %y, %z
@@ -36,9 +38,10 @@
 define i64 @un(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; ALL-LABEL: un:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movl %edx, %ecx
-; ALL-NEXT:    rorq %cl, %rdi
+; ALL-NEXT:    movq %rdx, %rcx
 ; ALL-NEXT:    movq %rdi, %rax
+; ALL-NEXT:    # kill: def $cl killed $cl killed $rcx
+; ALL-NEXT:    rorq %cl, %rax
 ; ALL-NEXT:    retq
 entry:
 	%0 = lshr i64 %x, %z
@@ -51,9 +54,10 @@
 define i64 @bu(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; ALL-LABEL: bu:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    movl %edx, %ecx
-; ALL-NEXT:    shrdq %cl, %rdi, %rsi
+; ALL-NEXT:    movq %rdx, %rcx
 ; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    # kill: def $cl killed $cl killed $rcx
+; ALL-NEXT:    shrdq %cl, %rdi, %rax
 ; ALL-NEXT:    retq
 entry:
 	%0 = lshr i64 %y, %z
@@ -66,14 +70,14 @@
 define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; X64-LABEL: xfoo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    rolq $7, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq $7, %rax
 ; X64-NEXT:    retq
 ;
 ; SHLD-LABEL: xfoo:
 ; SHLD:       # %bb.0: # %entry
-; SHLD-NEXT:    shldq $7, %rdi, %rdi
 ; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    shldq $7, %rdi, %rax
 ; SHLD-NEXT:    retq
 ;
 ; BMI2-LABEL: xfoo:
@@ -115,8 +119,8 @@
 define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; ALL-LABEL: xbar:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    shrdq $57, %rsi, %rdi
 ; ALL-NEXT:    movq %rdi, %rax
+; ALL-NEXT:    shrdq $57, %rsi, %rax
 ; ALL-NEXT:    retq
 entry:
 	%0 = shl i64 %y, 7
@@ -128,14 +132,14 @@
 define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; X64-LABEL: xun:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    rolq $57, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq $57, %rax
 ; X64-NEXT:    retq
 ;
 ; SHLD-LABEL: xun:
 ; SHLD:       # %bb.0: # %entry
-; SHLD-NEXT:    shldq $57, %rdi, %rdi
 ; SHLD-NEXT:    movq %rdi, %rax
+; SHLD-NEXT:    shldq $57, %rdi, %rax
 ; SHLD-NEXT:    retq
 ;
 ; BMI2-LABEL: xun:
@@ -177,8 +181,8 @@
 define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
 ; ALL-LABEL: xbu:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    shldq $57, %rsi, %rdi
 ; ALL-NEXT:    movq %rdi, %rax
+; ALL-NEXT:    shldq $57, %rsi, %rax
 ; ALL-NEXT:    retq
 entry:
 	%0 = lshr i64 %y, 7
diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll
index 50b2f96..7f287eb 100644
--- a/llvm/test/CodeGen/X86/rotate.ll
+++ b/llvm/test/CodeGen/X86/rotate.ll
@@ -43,8 +43,9 @@
 ; X64-LABEL: rotl64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolq %cl, %rax
 ; X64-NEXT:    retq
 	%shift.upgrd.1 = zext i8 %Amt to i64
 	%B = shl i64 %A, %shift.upgrd.1
@@ -96,8 +97,9 @@
 ; X64-LABEL: rotr64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorq %cl, %rax
 ; X64-NEXT:    retq
 	%shift.upgrd.3 = zext i8 %Amt to i64
 	%B = lshr i64 %A, %shift.upgrd.3
@@ -120,8 +122,8 @@
 ;
 ; X64-LABEL: rotli64:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolq $5, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq $5, %rax
 ; X64-NEXT:    retq
 	%B = shl i64 %A, 5
 	%C = lshr i64 %A, 59
@@ -141,8 +143,8 @@
 ;
 ; X64-LABEL: rotri64:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolq $59, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq $59, %rax
 ; X64-NEXT:    retq
 	%B = lshr i64 %A, 5
 	%C = shl i64 %A, 59
@@ -162,8 +164,8 @@
 ;
 ; X64-LABEL: rotl1_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolq %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq %rax
 ; X64-NEXT:    retq
 	%B = shl i64 %A, 1
 	%C = lshr i64 %A, 63
@@ -183,8 +185,8 @@
 ;
 ; X64-LABEL: rotr1_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    rorq %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rorq %rax
 ; X64-NEXT:    retq
 	%B = shl i64 %A, 63
 	%C = lshr i64 %A, 1
@@ -203,8 +205,9 @@
 ; X64-LABEL: rotl32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    roll %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    retq
 	%shift.upgrd.1 = zext i8 %Amt to i32
 	%B = shl i32 %A, %shift.upgrd.1
@@ -226,8 +229,9 @@
 ; X64-LABEL: rotr32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorl %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorl %cl, %eax
 ; X64-NEXT:    retq
 	%shift.upgrd.3 = zext i8 %Amt to i32
 	%B = lshr i32 %A, %shift.upgrd.3
@@ -247,8 +251,8 @@
 ;
 ; X64-LABEL: rotli32:
 ; X64:       # %bb.0:
-; X64-NEXT:    roll $5, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    roll $5, %eax
 ; X64-NEXT:    retq
 	%B = shl i32 %A, 5
 	%C = lshr i32 %A, 27
@@ -265,8 +269,8 @@
 ;
 ; X64-LABEL: rotri32:
 ; X64:       # %bb.0:
-; X64-NEXT:    roll $27, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    roll $27, %eax
 ; X64-NEXT:    retq
 	%B = lshr i32 %A, 5
 	%C = shl i32 %A, 27
@@ -283,8 +287,8 @@
 ;
 ; X64-LABEL: rotl1_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    roll %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    roll %eax
 ; X64-NEXT:    retq
 	%B = shl i32 %A, 1
 	%C = lshr i32 %A, 31
@@ -301,8 +305,8 @@
 ;
 ; X64-LABEL: rotr1_32:
 ; X64:       # %bb.0:
-; X64-NEXT:    rorl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rorl %eax
 ; X64-NEXT:    retq
 	%B = shl i32 %A, 31
 	%C = lshr i32 %A, 1
@@ -321,8 +325,10 @@
 ; X64-LABEL: rotl16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolw %cl, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%shift.upgrd.5 = zext i8 %Amt to i16
 	%B = shl i16 %A, %shift.upgrd.5
@@ -344,8 +350,10 @@
 ; X64-LABEL: rotr16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorw %cl, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%shift.upgrd.7 = zext i8 %Amt to i16
 	%B = lshr i16 %A, %shift.upgrd.7
@@ -365,8 +373,9 @@
 ;
 ; X64-LABEL: rotli16:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw $5, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolw $5, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%B = shl i16 %A, 5
 	%C = lshr i16 %A, 11
@@ -383,8 +392,9 @@
 ;
 ; X64-LABEL: rotri16:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw $11, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolw $11, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%B = lshr i16 %A, 5
 	%C = shl i16 %A, 11
@@ -401,8 +411,9 @@
 ;
 ; X64-LABEL: rotl1_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolw %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolw %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%B = shl i16 %A, 1
 	%C = lshr i16 %A, 15
@@ -419,8 +430,9 @@
 ;
 ; X64-LABEL: rotr1_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    rorw %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rorw %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 	%B = lshr i16 %A, 1
 	%C = shl i16 %A, 15
@@ -439,8 +451,10 @@
 ; X64-LABEL: rotl8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%B = shl i8 %A, %Amt
 	%Amt2 = sub i8 8, %Amt
@@ -460,8 +474,10 @@
 ; X64-LABEL: rotr8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%B = lshr i8 %A, %Amt
 	%Amt2 = sub i8 8, %Amt
@@ -479,8 +495,9 @@
 ;
 ; X64-LABEL: rotli8:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolb $5, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolb $5, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%B = shl i8 %A, 5
 	%C = lshr i8 %A, 3
@@ -497,8 +514,9 @@
 ;
 ; X64-LABEL: rotri8:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolb $3, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolb $3, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%B = lshr i8 %A, 5
 	%C = shl i8 %A, 3
@@ -515,8 +533,9 @@
 ;
 ; X64-LABEL: rotl1_8:
 ; X64:       # %bb.0:
-; X64-NEXT:    rolb %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rolb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%B = shl i8 %A, 1
 	%C = lshr i8 %A, 7
@@ -533,8 +552,9 @@
 ;
 ; X64-LABEL: rotr1_8:
 ; X64:       # %bb.0:
-; X64-NEXT:    rorb %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    rorb %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%B = lshr i8 %A, 1
 	%C = shl i8 %A, 7
@@ -665,6 +685,7 @@
 ; X64-LABEL: truncated_rot:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolq %cl, %rdi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/rotate2.ll b/llvm/test/CodeGen/X86/rotate2.ll
index 3833dc2..3b17923 100644
--- a/llvm/test/CodeGen/X86/rotate2.ll
+++ b/llvm/test/CodeGen/X86/rotate2.ll
@@ -14,8 +14,8 @@
 ;
 ; X64-LABEL: test1:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    rolq $9, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    rolq $9, %rax
 ; X64-NEXT:    retq
 entry:
 	%tmp2 = lshr i64 %x, 55		; <i64> [#uses=1]
@@ -34,9 +34,8 @@
 ;
 ; X64-LABEL: test2:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    roll $10, %edi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    roll $10, %eax
 ; X64-NEXT:    retq
 entry:
 	%tmp2 = lshr i32 %x, 22		; <i32> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll
index 29b63ec..5347fec 100644
--- a/llvm/test/CodeGen/X86/rotate4.ll
+++ b/llvm/test/CodeGen/X86/rotate4.ll
@@ -16,8 +16,9 @@
 ; X64-LABEL: rotate_left_32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    roll %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    retq
   %and = and i32 %b, 31
   %shl = shl i32 %a, %and
@@ -39,8 +40,9 @@
 ; X64-LABEL: rotate_right_32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorl %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorl %cl, %eax
 ; X64-NEXT:    retq
   %and = and i32 %b, 31
   %shl = lshr i32 %a, %and
@@ -98,9 +100,10 @@
 ;
 ; X64-LABEL: rotate_left_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolq %cl, %rdi
+; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    rolq %cl, %rax
 ; X64-NEXT:    retq
   %and = and i64 %b, 63
   %shl = shl i64 %a, %and
@@ -158,9 +161,10 @@
 ;
 ; X64-LABEL: rotate_right_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorq %cl, %rdi
+; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    rorq %cl, %rax
 ; X64-NEXT:    retq
   %and = and i64 %b, 63
   %shl = lshr i64 %a, %and
@@ -184,6 +188,7 @@
 ; X64-LABEL: rotate_left_m32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    roll %cl, (%rdi)
 ; X64-NEXT:    retq
   %a = load i32, i32* %pa, align 16
@@ -208,6 +213,7 @@
 ; X64-LABEL: rotate_right_m32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rorl %cl, (%rdi)
 ; X64-NEXT:    retq
   %a = load i32, i32* %pa, align 16
@@ -276,7 +282,8 @@
 ;
 ; X64-LABEL: rotate_left_m64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    rolq %cl, (%rdi)
 ; X64-NEXT:    retq
   %a = load i64, i64* %pa, align 16
@@ -345,7 +352,8 @@
 ;
 ; X64-LABEL: rotate_right_m64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movq %rsi, %rcx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    rorq %cl, (%rdi)
 ; X64-NEXT:    retq
   %a = load i64, i64* %pa, align 16
@@ -373,8 +381,10 @@
 ; X64-LABEL: rotate_left_8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %amt = trunc i32 %amount to i8
   %sub = sub i8 0, %amt
@@ -397,8 +407,10 @@
 ; X64-LABEL: rotate_right_8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorb %cl, %dil
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %amt = trunc i32 %amount to i8
   %sub = sub i8 0, %amt
@@ -421,8 +433,10 @@
 ; X64-LABEL: rotate_left_16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rolw %cl, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rolw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %amt = trunc i32 %amount to i16
   %sub = sub i16 0, %amt
@@ -445,8 +459,10 @@
 ; X64-LABEL: rotate_right_16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    rorw %cl, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    rorw %cl, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %amt = trunc i32 %amount to i16
   %sub = sub i16 0, %amt
@@ -469,6 +485,7 @@
 ; X64-LABEL: rotate_left_m8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolb %cl, (%rdi)
 ; X64-NEXT:    retq
   %x = load i8, i8* %p, align 1
@@ -494,6 +511,7 @@
 ; X64-LABEL: rotate_right_m8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rorb %cl, (%rdi)
 ; X64-NEXT:    retq
   %x = load i8, i8* %p, align 1
@@ -519,6 +537,7 @@
 ; X64-LABEL: rotate_left_m16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rolw %cl, (%rdi)
 ; X64-NEXT:    retq
   %x = load i16, i16* %p, align 1
@@ -544,6 +563,7 @@
 ; X64-LABEL: rotate_right_m16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    rorw %cl, (%rdi)
 ; X64-NEXT:    retq
   %x = load i16, i16* %p, align 1
@@ -569,10 +589,11 @@
 ;
 ; X64-LABEL: rotate_demanded_bits:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $30, %sil
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    roll %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $30, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    retq
   %3 = and i32 %1, 30
   %4 = shl i32 %0, %3
@@ -594,10 +615,11 @@
 ;
 ; X64-LABEL: rotate_demanded_bits_2:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $23, %sil
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    roll %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $23, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    retq
   %3 = and i32 %1, 23
   %4 = shl i32 %0, %3
@@ -620,11 +642,12 @@
 ;
 ; X64-LABEL: rotate_demanded_bits_3:
 ; X64:       # %bb.0:
-; X64-NEXT:    addb %sil, %sil
-; X64-NEXT:    andb $30, %sil
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    roll %cl, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    andb $30, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    roll %cl, %eax
 ; X64-NEXT:    retq
   %3 = shl i32 %1, 1
   %4 = and i32 %3, 30
diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll
index 29eaee4..54e5f34 100644
--- a/llvm/test/CodeGen/X86/sar_fold64.ll
+++ b/llvm/test/CodeGen/X86/sar_fold64.ll
@@ -56,9 +56,10 @@
 define i8 @all_sign_bit_ashr(i8 %x) {
 ; CHECK-LABEL: all_sign_bit_ashr:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    negb %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andb $1, %al
+; CHECK-NEXT:    negb %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %and = and i8 %x, 1
   %neg = sub i8 0, %and
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index 8265a55..5b667d8 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -10,13 +10,14 @@
 define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
 ; ANY-LABEL: unsigned_sat_constant_i8_using_min:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    cmpb $-43, %dil
+; ANY-NEXT:    movl %edi, %eax
+; ANY-NEXT:    cmpb $-43, %al
 ; ANY-NEXT:    jb .LBB0_2
 ; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movb $-43, %dil
+; ANY-NEXT:    movb $-43, %al
 ; ANY-NEXT:  .LBB0_2:
-; ANY-NEXT:    addb $42, %dil
-; ANY-NEXT:    movl %edi, %eax
+; ANY-NEXT:    addb $42, %al
+; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
   %c = icmp ult i8 %x, -43
   %s = select i1 %c, i8 %x, i8 -43
@@ -190,15 +191,16 @@
 define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
 ; ANY-LABEL: unsigned_sat_variable_i8_using_min:
 ; ANY:       # %bb.0:
-; ANY-NEXT:    movl %esi, %eax
-; ANY-NEXT:    notb %al
-; ANY-NEXT:    cmpb %al, %dil
+; ANY-NEXT:    movl %edi, %eax
+; ANY-NEXT:    movl %esi, %ecx
+; ANY-NEXT:    notb %cl
+; ANY-NEXT:    cmpb %cl, %al
 ; ANY-NEXT:    jb .LBB12_2
 ; ANY-NEXT:  # %bb.1:
-; ANY-NEXT:    movl %eax, %edi
+; ANY-NEXT:    movl %ecx, %eax
 ; ANY-NEXT:  .LBB12_2:
-; ANY-NEXT:    addb %sil, %dil
-; ANY-NEXT:    movl %edi, %eax
+; ANY-NEXT:    addb %sil, %al
+; ANY-NEXT:    # kill: def $al killed $al killed $eax
 ; ANY-NEXT:    retq
   %noty = xor i8 %y, -1
   %c = icmp ult i8 %x, %noty
diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll
index d66a2f6..4678164 100644
--- a/llvm/test/CodeGen/X86/scalar_widen_div.ll
+++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll
@@ -56,20 +56,21 @@
 define <3 x i8> @test_char_div(<3 x i8> %num, <3 x i8> %div) {
 ; CHECK-LABEL: test_char_div:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %r10d
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    cbtw
 ; CHECK-NEXT:    idivb %cl
 ; CHECK-NEXT:    movl %eax, %edi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    cbtw
 ; CHECK-NEXT:    idivb %r8b
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    movl %r10d, %eax
 ; CHECK-NEXT:    cbtw
 ; CHECK-NEXT:    idivb %r9b
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    retq
   %div.r = sdiv <3 x i8> %num, %div
   ret <3 x i8>  %div.r
@@ -232,8 +233,8 @@
 ; CHECK-LABEL: test_ulong_div:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdx, %r10
-; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divq %rcx
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    xorl %edx, %edx
diff --git a/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll b/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll
index 4f96d2c..46388d7 100644
--- a/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/llvm/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -12,20 +12,20 @@
 define i64 @lshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ; GENERIC-LABEL: lshift10_optsize:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    shldq $10, %rsi, %rdi # sched: [2:0.67]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shldq $10, %rsi, %rdi # sched: [3:3.00]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shldq $10, %rsi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift10_optsize:
 ; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shldq $10, %rsi, %rdi
 ; BDVER1-NEXT:    movq %rdi, %rax
+; BDVER1-NEXT:    shldq $10, %rsi, %rax
 ; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, 10
@@ -37,8 +37,8 @@
 define i64 @lshift10(i64 %a, i64 %b) nounwind readnone {
 ; GENERIC-LABEL: lshift10:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    shldq $10, %rsi, %rdi # sched: [2:0.67]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift10:
@@ -70,20 +70,20 @@
 define i64 @rshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ; GENERIC-LABEL: rshift10_optsize:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    shrdq $62, %rsi, %rdi # sched: [2:0.67]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: rshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    shrdq $62, %rsi, %rdi # sched: [3:3.00]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shrdq $62, %rsi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: rshift10_optsize:
 ; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shrdq $62, %rsi, %rdi
 ; BDVER1-NEXT:    movq %rdi, %rax
+; BDVER1-NEXT:    shrdq $62, %rsi, %rax
 ; BDVER1-NEXT:    retq
 entry:
   %shl = lshr i64 %a, 62
@@ -96,8 +96,8 @@
 define i64 @rshift10(i64 %a, i64 %b) nounwind readnone {
 ; GENERIC-LABEL: rshift10:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    shrdq $62, %rsi, %rdi # sched: [2:0.67]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: rshift10:
@@ -126,23 +126,26 @@
 define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize {
 ; GENERIC-LABEL: lshift_cl_optsize:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movl %edx, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
+; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movl %edx, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BTVER2-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift_cl_optsize:
 ; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movl %edx, %ecx
-; BDVER1-NEXT:    shldq %cl, %rsi, %rdi
+; BDVER1-NEXT:    movq %rdx, %rcx
 ; BDVER1-NEXT:    movq %rdi, %rax
+; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER1-NEXT:    shldq %cl, %rsi, %rax
 ; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, %c
@@ -155,31 +158,32 @@
 define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; GENERIC-LABEL: lshift_cl:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movl %edx, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
+; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    negl %ecx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shrq %cl, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shrq %cl, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift_cl:
 ; BDVER1:       # %bb.0: # %entry
 ; BDVER1-NEXT:    movq %rdx, %rcx
+; BDVER1-NEXT:    movq %rsi, %rax
 ; BDVER1-NEXT:    shlq %cl, %rdi
 ; BDVER1-NEXT:    negl %ecx
 ; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrq %cl, %rsi
-; BDVER1-NEXT:    orq %rdi, %rsi
-; BDVER1-NEXT:    movq %rsi, %rax
+; BDVER1-NEXT:    shrq %cl, %rax
+; BDVER1-NEXT:    orq %rdi, %rax
 ; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, %c
@@ -198,23 +202,26 @@
 define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize {
 ; GENERIC-LABEL: rshift_cl_optsize:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movl %edx, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
+; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: rshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movl %edx, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BTVER2-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: rshift_cl_optsize:
 ; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movl %edx, %ecx
-; BDVER1-NEXT:    shrdq %cl, %rsi, %rdi
+; BDVER1-NEXT:    movq %rdx, %rcx
 ; BDVER1-NEXT:    movq %rdi, %rax
+; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER1-NEXT:    shrdq %cl, %rsi, %rax
 ; BDVER1-NEXT:    retq
 entry:
   %shr = lshr i64 %a, %c
@@ -227,31 +234,32 @@
 define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; GENERIC-LABEL: rshift_cl:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movl %edx, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT:    movq %rdx, %rcx # sched: [1:0.33]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
+; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: rshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    negl %ecx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BTVER2-NEXT:    shlq %cl, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    orq %rdi, %rsi # sched: [1:0.50]
-; BTVER2-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: rshift_cl:
 ; BDVER1:       # %bb.0: # %entry
 ; BDVER1-NEXT:    movq %rdx, %rcx
+; BDVER1-NEXT:    movq %rsi, %rax
 ; BDVER1-NEXT:    shrq %cl, %rdi
 ; BDVER1-NEXT:    negl %ecx
 ; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shlq %cl, %rsi
-; BDVER1-NEXT:    orq %rdi, %rsi
-; BDVER1-NEXT:    movq %rsi, %rax
+; BDVER1-NEXT:    shlq %cl, %rax
+; BDVER1-NEXT:    orq %rdi, %rax
 ; BDVER1-NEXT:    retq
 entry:
   %shr = lshr i64 %a, %c
@@ -271,19 +279,22 @@
 define void @lshift_mem_cl_optsize(i64 %a, i64 %c) nounwind readnone optsize {
 ; GENERIC-LABEL: lshift_mem_cl_optsize:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; BTVER2-LABEL: lshift_mem_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
+; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; BDVER1-LABEL: lshift_mem_cl_optsize:
 ; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movl %esi, %ecx
+; BDVER1-NEXT:    movq %rsi, %rcx
+; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BDVER1-NEXT:    shldq %cl, %rdi, {{.*}}(%rip)
 ; BDVER1-NEXT:    retq
 entry:
@@ -299,7 +310,8 @@
 define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; GENERIC-LABEL: lshift_mem_cl:
 ; GENERIC:       # %bb.0: # %entry
-; GENERIC-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    movq %rsi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
diff --git a/llvm/test/CodeGen/X86/schedule-x86_64.ll b/llvm/test/CodeGen/X86/schedule-x86_64.ll
index 3e48097..0a9eee2 100644
--- a/llvm/test/CodeGen/X86/schedule-x86_64.ll
+++ b/llvm/test/CodeGen/X86/schedule-x86_64.ll
@@ -2541,62 +2541,62 @@
 define i32 @test_bswap32(i32 %a0) optsize {
 ; GENERIC-LABEL: test_bswap32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    bswapl %edi # sched: [1:1.00]
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    bswapl %eax # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_bswap32:
 ; ATOM:       # %bb.0:
-; ATOM-NEXT:    bswapl %edi # sched: [1:1.00]
 ; ATOM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; ATOM-NEXT:    bswapl %eax # sched: [1:1.00]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_bswap32:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    bswapl %edi # sched: [1:0.50]
 ; SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT:    bswapl %eax # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_bswap32:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    bswapl %edi # sched: [1:1.00]
 ; SANDY-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT:    bswapl %eax # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_bswap32:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    bswapl %edi # sched: [1:0.50]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    bswapl %eax # sched: [1:0.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_bswap32:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    bswapl %edi # sched: [1:0.50]
 ; BROADWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT:    bswapl %eax # sched: [1:0.50]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_bswap32:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    bswapl %edi # sched: [1:0.50]
 ; SKYLAKE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT:    bswapl %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_bswap32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    bswapl %edi # sched: [1:0.50]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    bswapl %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_bswap32:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    bswapl %edi # sched: [1:0.50]
 ; BTVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT:    bswapl %eax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_bswap32:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    bswapl %edi # sched: [1:1.00]
 ; ZNVER1-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT:    bswapl %eax # sched: [1:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = tail call i32 asm "bswap $0", "=r,0"(i32 %a0) nounwind
   ret i32 %1
@@ -2604,62 +2604,62 @@
 define i64 @test_bswap64(i64 %a0) optsize {
 ; GENERIC-LABEL: test_bswap64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    bswapq %rdi # sched: [2:1.00]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    bswapq %rax # sched: [2:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_bswap64:
 ; ATOM:       # %bb.0:
-; ATOM-NEXT:    bswapq %rdi # sched: [1:1.00]
 ; ATOM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT:    bswapq %rax # sched: [1:1.00]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_bswap64:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    bswapq %rdi # sched: [1:0.50]
 ; SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT:    bswapq %rax # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_bswap64:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    bswapq %rdi # sched: [2:1.00]
 ; SANDY-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT:    bswapq %rax # sched: [2:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_bswap64:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    bswapq %rdi # sched: [2:0.50]
 ; HASWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT:    bswapq %rax # sched: [2:0.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_bswap64:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    bswapq %rdi # sched: [2:0.50]
 ; BROADWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT:    bswapq %rax # sched: [2:0.50]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_bswap64:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    bswapq %rdi # sched: [2:0.50]
 ; SKYLAKE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT:    bswapq %rax # sched: [2:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_bswap64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    bswapq %rdi # sched: [2:0.50]
 ; SKX-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT:    bswapq %rax # sched: [2:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_bswap64:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    bswapq %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    bswapq %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_bswap64:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    bswapq %rdi # sched: [1:1.00]
 ; ZNVER1-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT:    bswapq %rax # sched: [1:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = tail call i64 asm "bswap $0", "=r,0"(i64 %a0) nounwind
   ret i64 %1
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index f82d438..f35da3b 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -1062,16 +1062,18 @@
 define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
 ; GENERIC-LABEL: test18:
 ; GENERIC:       ## %bb.0:
-; GENERIC-NEXT:    cmpl $15, %edi
-; GENERIC-NEXT:    cmovgel %edx, %esi
 ; GENERIC-NEXT:    movl %esi, %eax
+; GENERIC-NEXT:    cmpl $15, %edi
+; GENERIC-NEXT:    cmovgel %edx, %eax
+; GENERIC-NEXT:    ## kill: def $al killed $al killed $eax
 ; GENERIC-NEXT:    retq
 ;
 ; ATOM-LABEL: test18:
 ; ATOM:       ## %bb.0:
-; ATOM-NEXT:    cmpl $15, %edi
-; ATOM-NEXT:    cmovgel %edx, %esi
 ; ATOM-NEXT:    movl %esi, %eax
+; ATOM-NEXT:    cmpl $15, %edi
+; ATOM-NEXT:    cmovgel %edx, %eax
+; ATOM-NEXT:    ## kill: def $al killed $al killed $eax
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
@@ -1102,10 +1104,11 @@
 define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
 ; CHECK-LABEL: trunc_select_miscompile:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    orb $2, %sil
 ; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    shll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    orb $2, %cl
+; CHECK-NEXT:    ## kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: trunc_select_miscompile:
@@ -1118,8 +1121,9 @@
 ;
 ; MCU-LABEL: trunc_select_miscompile:
 ; MCU:       # %bb.0:
-; MCU-NEXT:    orb $2, %dl
 ; MCU-NEXT:    movl %edx, %ecx
+; MCU-NEXT:    orb $2, %cl
+; MCU-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; MCU-NEXT:    shll %cl, %eax
 ; MCU-NEXT:    retl
   %tmp1 = select i1 %cc, i32 3, i32 2
@@ -1438,10 +1442,10 @@
 define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_xor_2:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    xorl %edi, %esi
-; CHECK-NEXT:    testb $1, %dl
-; CHECK-NEXT:    cmovel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    xorl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: select_xor_2:
@@ -1473,10 +1477,10 @@
 define i32 @select_xor_2b(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_xor_2b:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    xorl %edi, %esi
-; CHECK-NEXT:    testb $1, %dl
-; CHECK-NEXT:    cmovel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    xorl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: select_xor_2b:
@@ -1507,10 +1511,10 @@
 define i32 @select_or(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_or:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    orl %edi, %esi
-; CHECK-NEXT:    testb $1, %dl
-; CHECK-NEXT:    cmovel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: select_or:
@@ -1542,10 +1546,10 @@
 define i32 @select_or_b(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_or_b:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    orl %edi, %esi
-; CHECK-NEXT:    testb $1, %dl
-; CHECK-NEXT:    cmovel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: select_or_b:
@@ -1576,10 +1580,10 @@
 define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) {
 ; CHECK-LABEL: select_or_1:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    orl %edi, %esi
-; CHECK-NEXT:    testb $1, %dl
-; CHECK-NEXT:    cmovel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: select_or_1:
@@ -1611,10 +1615,10 @@
 define i32 @select_or_1b(i32 %A, i32 %B, i32 %cond) {
 ; CHECK-LABEL: select_or_1b:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    orl %edi, %esi
-; CHECK-NEXT:    testb $1, %dl
-; CHECK-NEXT:    cmovel %edi, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %eax
 ; CHECK-NEXT:    retq
 ;
 ; ATHLON-LABEL: select_or_1b:
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index aad18fe..fb7746a 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -43,8 +43,8 @@
 define i32 @select_1_or_0(i1 %cond) {
 ; CHECK-LABEL: select_1_or_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 1, i32 0
   ret i32 %sel
@@ -62,8 +62,8 @@
 define i32 @select_1_or_0_signext(i1 signext %cond) {
 ; CHECK-LABEL: select_1_or_0_signext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 1, i32 0
   ret i32 %sel
@@ -95,8 +95,8 @@
 define i32 @select_0_or_neg1_signext(i1 signext %cond) {
 ; CHECK-LABEL: select_0_or_neg1_signext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    notl %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notl %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 0, i32 -1
   ret i32 %sel
@@ -107,9 +107,9 @@
 define i32 @select_neg1_or_0(i1 %cond) {
 ; CHECK-LABEL: select_neg1_or_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    negl %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 -1, i32 0
   ret i32 %sel
@@ -118,8 +118,8 @@
 define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) {
 ; CHECK-LABEL: select_neg1_or_0_zeroext:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    negl %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i32 -1, i32 0
   ret i32 %sel
@@ -329,9 +329,10 @@
 define i8 @select_pow2_diff(i1 zeroext %cond) {
 ; CHECK-LABEL: select_pow2_diff:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shlb $4, %dil
-; CHECK-NEXT:    orb $3, %dil
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    orb $3, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, i8 19, i8 3
   ret i8 %sel
diff --git a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
index 2c7fb19..b6bbce9 100644
--- a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll
@@ -7,16 +7,16 @@
 define i32 @neg_sel_constants(i32 %a) {
 ; CHECK-NOBMI-LABEL: neg_sel_constants:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    sarl $31, %edi
-; CHECK-NOBMI-NEXT:    andl $5, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    sarl $31, %eax
+; CHECK-NOBMI-NEXT:    andl $5, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: neg_sel_constants:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    sarl $31, %edi
-; CHECK-BMI-NEXT:    andl $5, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    sarl $31, %eax
+; CHECK-BMI-NEXT:    andl $5, %eax
 ; CHECK-BMI-NEXT:    retq
   %tmp.1 = icmp slt i32 %a, 0
   %retval = select i1 %tmp.1, i32 5, i32 0
@@ -28,16 +28,16 @@
 define i32 @neg_sel_special_constant(i32 %a) {
 ; CHECK-NOBMI-LABEL: neg_sel_special_constant:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    shrl $22, %edi
-; CHECK-NOBMI-NEXT:    andl $512, %edi # imm = 0x200
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    shrl $22, %eax
+; CHECK-NOBMI-NEXT:    andl $512, %eax # imm = 0x200
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: neg_sel_special_constant:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    shrl $22, %edi
-; CHECK-BMI-NEXT:    andl $512, %edi # imm = 0x200
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    shrl $22, %eax
+; CHECK-BMI-NEXT:    andl $512, %eax # imm = 0x200
 ; CHECK-BMI-NEXT:    retq
   %tmp.1 = icmp slt i32 %a, 0
   %retval = select i1 %tmp.1, i32 512, i32 0
@@ -49,16 +49,16 @@
 define i32 @neg_sel_variable_and_zero(i32 %a, i32 %b) {
 ; CHECK-NOBMI-LABEL: neg_sel_variable_and_zero:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    sarl $31, %edi
-; CHECK-NOBMI-NEXT:    andl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    sarl $31, %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: neg_sel_variable_and_zero:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    sarl $31, %edi
-; CHECK-BMI-NEXT:    andl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    sarl $31, %eax
+; CHECK-BMI-NEXT:    andl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %tmp.1 = icmp slt i32 %a, 0
   %retval = select i1 %tmp.1, i32 %b, i32 0
@@ -116,18 +116,18 @@
 define i32 @pos_sel_special_constant(i32 %a) {
 ; CHECK-NOBMI-LABEL: pos_sel_special_constant:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    notl %edi
-; CHECK-NOBMI-NEXT:    shrl $22, %edi
-; CHECK-NOBMI-NEXT:    andl $512, %edi # imm = 0x200
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    shrl $22, %eax
+; CHECK-NOBMI-NEXT:    andl $512, %eax # imm = 0x200
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: pos_sel_special_constant:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    notl %edi
-; CHECK-BMI-NEXT:    shrl $22, %edi
-; CHECK-BMI-NEXT:    andl $512, %edi # imm = 0x200
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    notl %eax
+; CHECK-BMI-NEXT:    shrl $22, %eax
+; CHECK-BMI-NEXT:    andl $512, %eax # imm = 0x200
 ; CHECK-BMI-NEXT:    retq
   %tmp.1 = icmp sgt i32 %a, -1
   %retval = select i1 %tmp.1, i32 512, i32 0
diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll
index 9933b9c..231b9f4 100644
--- a/llvm/test/CodeGen/X86/setcc-logic.ll
+++ b/llvm/test/CodeGen/X86/setcc-logic.ll
@@ -41,9 +41,10 @@
 define zeroext i1 @all_sign_bits_set(i32 %P, i32 %Q) nounwind {
 ; CHECK-LABEL: all_sign_bits_set:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    shrl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = icmp slt i32 %P, 0
   %b = icmp slt i32 %Q, 0
@@ -66,9 +67,10 @@
 define zeroext i1 @any_sign_bits_set(i32 %P, i32 %Q) nounwind {
 ; CHECK-LABEL: any_sign_bits_set:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    shrl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %a = icmp slt i32 %P, 0
   %b = icmp slt i32 %Q, 0
diff --git a/llvm/test/CodeGen/X86/sext-i1.ll b/llvm/test/CodeGen/X86/sext-i1.ll
index 578d2c9..b1b0676 100644
--- a/llvm/test/CodeGen/X86/sext-i1.ll
+++ b/llvm/test/CodeGen/X86/sext-i1.ll
@@ -164,8 +164,8 @@
 ;
 ; X64-LABEL: select_0_or_1s_signext:
 ; X64:       # %bb.0:
-; X64-NEXT:    notl %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    notl %eax
 ; X64-NEXT:    retq
   %not = xor i1 %cond, 1
   %sext = sext i1 %not to i32
diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll
index 1e448d3..fc8eb2f 100644
--- a/llvm/test/CodeGen/X86/shift-and.ll
+++ b/llvm/test/CodeGen/X86/shift-and.ll
@@ -12,9 +12,10 @@
 ;
 ; X64-LABEL: t1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    shll %cl, %esi
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
        %shamt = and i32 %t, 31
        %res = shl i32 %val, %shamt
@@ -31,9 +32,10 @@
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    shll %cl, %esi
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    retq
        %shamt = and i32 %t, 63
        %res = shl i32 %val, %shamt
@@ -52,6 +54,7 @@
 ; X64-LABEL: t3:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    sarw %cl, {{.*}}(%rip)
 ; X64-NEXT:    retq
        %shamt = and i16 %t, 31
@@ -82,9 +85,10 @@
 ;
 ; X64-LABEL: t4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    shrq %cl, %rsi
 ; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
        %shamt = and i64 %t, 63
        %res = lshr i64 %val, %shamt
@@ -112,9 +116,10 @@
 ;
 ; X64-LABEL: t5:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    shrq %cl, %rsi
 ; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
        %shamt = and i64 %t, 191
        %res = lshr i64 %val, %shamt
@@ -147,7 +152,8 @@
 ;
 ; X64-LABEL: t5ptr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, (%rsi)
 ; X64-NEXT:    retq
        %shamt = and i64 %t, 191
@@ -205,9 +211,9 @@
 ;
 ; X64-LABEL: big_mask_constant:
 ; X64:       # %bb.0:
-; X64-NEXT:    shrq $7, %rdi
-; X64-NEXT:    andl $134217728, %edi # imm = 0x8000000
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrq $7, %rax
+; X64-NEXT:    andl $134217728, %eax # imm = 0x8000000
 ; X64-NEXT:    retq
   %and = and i64 %x, 17179869184 ; 0x400000000
   %sh = lshr i64 %and, 7
diff --git a/llvm/test/CodeGen/X86/shift-bmi2.ll b/llvm/test/CodeGen/X86/shift-bmi2.ll
index 07e60e3..96d1103 100644
--- a/llvm/test/CodeGen/X86/shift-bmi2.ll
+++ b/llvm/test/CodeGen/X86/shift-bmi2.ll
@@ -26,8 +26,8 @@
 ;
 ; BMI264-LABEL: shl32i:
 ; BMI264:       # %bb.0:
-; BMI264-NEXT:    shll $5, %edi
 ; BMI264-NEXT:    movl %edi, %eax
+; BMI264-NEXT:    shll $5, %eax
 ; BMI264-NEXT:    retq
   %shl = shl i32 %x, 5
   ret i32 %shl
@@ -69,6 +69,24 @@
 }
 
 define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+; BMI2-LABEL: shl64:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 8
+; BMI2-NEXT:    .cfi_offset %esi, -8
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shldl %cl, %eax, %edx
+; BMI2-NEXT:    shlxl %ecx, %eax, %esi
+; BMI2-NEXT:    xorl %eax, %eax
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %edx
+; BMI2-NEXT:    cmovel %esi, %eax
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 4
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: shl64:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    shlxq %rsi, %rdi, %rax
@@ -78,16 +96,43 @@
 }
 
 define i64 @shl64i(i64 %x) nounwind uwtable readnone {
+; BMI2-LABEL: shl64i:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shldl $7, %eax, %edx
+; BMI2-NEXT:    shll $7, %eax
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: shl64i:
 ; BMI264:       # %bb.0:
-; BMI264-NEXT:    shlq $7, %rdi
 ; BMI264-NEXT:    movq %rdi, %rax
+; BMI264-NEXT:    shlq $7, %rax
 ; BMI264-NEXT:    retq
   %shl = shl i64 %x, 7
   ret i64 %shl
 }
 
 define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+; BMI2-LABEL: shl64p:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 8
+; BMI2-NEXT:    .cfi_offset %esi, -8
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl (%eax), %esi
+; BMI2-NEXT:    movl 4(%eax), %edx
+; BMI2-NEXT:    shldl %cl, %esi, %edx
+; BMI2-NEXT:    shlxl %ecx, %esi, %esi
+; BMI2-NEXT:    xorl %eax, %eax
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %edx
+; BMI2-NEXT:    cmovel %esi, %eax
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 4
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: shl64p:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    shlxq %rsi, (%rdi), %rax
@@ -98,6 +143,15 @@
 }
 
 define i64 @shl64pi(i64* %p) nounwind uwtable readnone {
+; BMI2-LABEL: shl64pi:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; BMI2-NEXT:    movl (%ecx), %eax
+; BMI2-NEXT:    movl 4(%ecx), %edx
+; BMI2-NEXT:    shldl $7, %eax, %edx
+; BMI2-NEXT:    shll $7, %eax
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: shl64pi:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    movq (%rdi), %rax
@@ -141,6 +195,24 @@
 }
 
 define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+; BMI2-LABEL: lshr64:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 8
+; BMI2-NEXT:    .cfi_offset %esi, -8
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shrdl %cl, %edx, %eax
+; BMI2-NEXT:    shrxl %ecx, %edx, %esi
+; BMI2-NEXT:    xorl %edx, %edx
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %eax
+; BMI2-NEXT:    cmovel %esi, %edx
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 4
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: lshr64:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    shrxq %rsi, %rdi, %rax
@@ -150,6 +222,25 @@
 }
 
 define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+; BMI2-LABEL: lshr64p:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 8
+; BMI2-NEXT:    .cfi_offset %esi, -8
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    movl (%edx), %eax
+; BMI2-NEXT:    movl 4(%edx), %edx
+; BMI2-NEXT:    shrdl %cl, %edx, %eax
+; BMI2-NEXT:    shrxl %ecx, %edx, %esi
+; BMI2-NEXT:    xorl %edx, %edx
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %eax
+; BMI2-NEXT:    cmovel %esi, %edx
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 4
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: lshr64p:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    shrxq %rsi, (%rdi), %rax
@@ -192,6 +283,24 @@
 }
 
 define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+; BMI2-LABEL: ashr64:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 8
+; BMI2-NEXT:    .cfi_offset %esi, -8
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shrdl %cl, %edx, %eax
+; BMI2-NEXT:    sarxl %ecx, %edx, %esi
+; BMI2-NEXT:    sarl $31, %edx
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %eax
+; BMI2-NEXT:    cmovel %esi, %edx
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 4
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: ashr64:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    sarxq %rsi, %rdi, %rax
@@ -201,6 +310,25 @@
 }
 
 define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+; BMI2-LABEL: ashr64p:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 8
+; BMI2-NEXT:    .cfi_offset %esi, -8
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    movl (%edx), %eax
+; BMI2-NEXT:    movl 4(%edx), %edx
+; BMI2-NEXT:    shrdl %cl, %edx, %eax
+; BMI2-NEXT:    sarxl %ecx, %edx, %esi
+; BMI2-NEXT:    sarl $31, %edx
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %eax
+; BMI2-NEXT:    cmovel %esi, %edx
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    .cfi_def_cfa_offset 4
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: ashr64p:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    sarxq %rsi, (%rdi), %rax
@@ -227,6 +355,21 @@
 }
 
 define i64 @shl64and(i64 %t, i64 %val) nounwind {
+; BMI2-LABEL: shl64and:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shldl %cl, %eax, %edx
+; BMI2-NEXT:    shlxl %ecx, %eax, %esi
+; BMI2-NEXT:    xorl %eax, %eax
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %edx
+; BMI2-NEXT:    cmovel %esi, %eax
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: shl64and:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    shlxq %rdi, %rsi, %rax
@@ -253,6 +396,21 @@
 }
 
 define i64 @lshr64and(i64 %t, i64 %val) nounwind {
+; BMI2-LABEL: lshr64and:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shrdl %cl, %edx, %eax
+; BMI2-NEXT:    shrxl %ecx, %edx, %esi
+; BMI2-NEXT:    xorl %edx, %edx
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %eax
+; BMI2-NEXT:    cmovel %esi, %edx
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: lshr64and:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    shrxq %rdi, %rsi, %rax
@@ -279,6 +437,21 @@
 }
 
 define i64 @ashr64and(i64 %t, i64 %val) nounwind {
+; BMI2-LABEL: ashr64and:
+; BMI2:       # %bb.0:
+; BMI2-NEXT:    pushl %esi
+; BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; BMI2-NEXT:    shrdl %cl, %edx, %eax
+; BMI2-NEXT:    sarxl %ecx, %edx, %esi
+; BMI2-NEXT:    sarl $31, %edx
+; BMI2-NEXT:    testb $32, %cl
+; BMI2-NEXT:    cmovnel %esi, %eax
+; BMI2-NEXT:    cmovel %esi, %edx
+; BMI2-NEXT:    popl %esi
+; BMI2-NEXT:    retl
+;
 ; BMI264-LABEL: ashr64and:
 ; BMI264:       # %bb.0:
 ; BMI264-NEXT:    sarxq %rdi, %rsi, %rax
diff --git a/llvm/test/CodeGen/X86/shift-double-x86_64.ll b/llvm/test/CodeGen/X86/shift-double-x86_64.ll
index 0d5d949..da5d10c 100644
--- a/llvm/test/CodeGen/X86/shift-double-x86_64.ll
+++ b/llvm/test/CodeGen/X86/shift-double-x86_64.ll
@@ -6,10 +6,11 @@
 define i64 @test1(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $63, %edx
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shldq %cl, %rsi, %rax
 ; CHECK-NEXT:    retq
   %and = and i64 %bits, 63
   %and64 = sub i64 64, %and
@@ -22,10 +23,11 @@
 define i64 @test2(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $63, %edx
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shrdq %cl, %rdi, %rsi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    andl $63, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shrdq %cl, %rdi, %rax
 ; CHECK-NEXT:    retq
   %and = and i64 %bits, 63
   %and64 = sub i64 64, %and
@@ -38,9 +40,10 @@
 define i64 @test3(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shldq %cl, %rsi, %rax
 ; CHECK-NEXT:    retq
   %bits64 = sub i64 64, %bits
   %sh_lo = lshr i64 %lo, %bits64
@@ -52,9 +55,10 @@
 define i64 @test4(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shrdq %cl, %rdi, %rsi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shrdq %cl, %rdi, %rax
 ; CHECK-NEXT:    retq
   %bits64 = sub i64 64, %bits
   %sh_lo = shl i64 %hi, %bits64
@@ -66,9 +70,10 @@
 define i64 @test5(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shldq %cl, %rsi, %rax
 ; CHECK-NEXT:    retq
   %bits64 = xor i64 %bits, 63
   %lo2 = lshr i64 %lo, 1
@@ -81,9 +86,10 @@
 define i64 @test6(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shrdq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shrdq %cl, %rsi, %rax
 ; CHECK-NEXT:    retq
   %bits64 = xor i64 %bits, 63
   %lo2 = shl i64 %lo, 1
@@ -96,9 +102,10 @@
 define i64 @test7(i64 %hi, i64 %lo, i64 %bits) nounwind {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    shrdq %cl, %rsi, %rdi
+; CHECK-NEXT:    movq %rdx, %rcx
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
+; CHECK-NEXT:    shrdq %cl, %rsi, %rax
 ; CHECK-NEXT:    retq
   %bits64 = xor i64 %bits, 63
   %lo2 = add i64 %lo, %lo
diff --git a/llvm/test/CodeGen/X86/shift-double.ll b/llvm/test/CodeGen/X86/shift-double.ll
index 9037623..b1b5f1c 100644
--- a/llvm/test/CodeGen/X86/shift-double.ll
+++ b/llvm/test/CodeGen/X86/shift-double.ll
@@ -26,8 +26,9 @@
 ; X64-LABEL: test1:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shlq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
         %shift.upgrd.1 = zext i8 %C to i64              ; <i64> [#uses=1]
         %Y = shl i64 %X, %shift.upgrd.1         ; <i64> [#uses=1]
@@ -57,8 +58,9 @@
 ; X64-LABEL: test2:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    sarq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    sarq %cl, %rax
 ; X64-NEXT:    retq
         %shift.upgrd.2 = zext i8 %C to i64              ; <i64> [#uses=1]
         %Y = ashr i64 %X, %shift.upgrd.2                ; <i64> [#uses=1]
@@ -87,8 +89,9 @@
 ; X64-LABEL: test3:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shrq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
         %shift.upgrd.3 = zext i8 %C to i64              ; <i64> [#uses=1]
         %Y = lshr i64 %X, %shift.upgrd.3                ; <i64> [#uses=1]
@@ -109,8 +112,9 @@
 ; X64-LABEL: test4:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
 ; X64-NEXT:    retq
         %shift.upgrd.4 = zext i8 %C to i32              ; <i32> [#uses=1]
         %X = shl i32 %A, %shift.upgrd.4         ; <i32> [#uses=1]
@@ -133,8 +137,10 @@
 ; X64-LABEL: test5:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldw %cl, %si, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldw %cl, %si, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
         %shift.upgrd.6 = zext i8 %C to i16              ; <i16> [#uses=1]
         %X = shl i16 %A, %shift.upgrd.6         ; <i16> [#uses=1]
@@ -159,8 +165,9 @@
 ; X64-LABEL: test6:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %esi, %eax
 ; X64-NEXT:    retq
         %shift.upgrd.4 = zext i8 %C to i32              ; <i32> [#uses=1]
         %X = lshr i32 %A, %shift.upgrd.4         ; <i32> [#uses=1]
@@ -183,8 +190,10 @@
 ; X64-LABEL: test7:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdw %cl, %si, %di
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdw %cl, %si, %ax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
         %shift.upgrd.6 = zext i8 %C to i16              ; <i16> [#uses=1]
         %X = lshr i16 %A, %shift.upgrd.6         ; <i16> [#uses=1]
@@ -212,10 +221,11 @@
 ;
 ; X64-LABEL: test8:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $31, %sil
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shlq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andb $31, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shlq %cl, %rax
 ; X64-NEXT:    retq
   %and = and i32 %bits, 31
   %sh_prom = zext i32 %and to i64
@@ -235,10 +245,11 @@
 ;
 ; X64-LABEL: test9:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $31, %sil
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    sarq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andb $31, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    sarq %cl, %rax
 ; X64-NEXT:    retq
   %and = and i32 %bits, 31
   %sh_prom = zext i32 %and to i64
@@ -258,10 +269,11 @@
 ;
 ; X64-LABEL: test10:
 ; X64:       # %bb.0:
-; X64-NEXT:    andb $31, %sil
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    shrq %cl, %rdi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andb $31, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
   %and = and i32 %bits, 31
   %sh_prom = zext i32 %and to i64
@@ -284,10 +296,11 @@
 ;
 ; X64-LABEL: test11:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $31, %edx
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %and = and i32 %bits, 31
   %and32 = sub i32 32, %and
@@ -310,10 +323,11 @@
 ;
 ; X64-LABEL: test12:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $31, %edx
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdl %cl, %edi, %esi
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl $31, %ecx
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %edi, %eax
 ; X64-NEXT:    retq
   %and = and i32 %bits, 31
   %and32 = sub i32 32, %and
@@ -335,8 +349,9 @@
 ; X64-LABEL: test13:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %bits32 = sub i32 32, %bits
   %sh_lo = lshr i32 %lo, %bits32
@@ -357,8 +372,9 @@
 ; X64-LABEL: test14:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdl %cl, %edi, %esi
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %edi, %eax
 ; X64-NEXT:    retq
   %bits32 = sub i32 32, %bits
   %sh_lo = shl i32 %hi, %bits32
@@ -379,8 +395,9 @@
 ; X64-LABEL: test15:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shldl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %bits32 = xor i32 %bits, 31
   %lo2 = lshr i32 %lo, 1
@@ -402,8 +419,9 @@
 ; X64-LABEL: test16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %bits32 = xor i32 %bits, 31
   %lo2 = shl i32 %lo, 1
@@ -425,8 +443,9 @@
 ; X64-LABEL: test17:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
-; X64-NEXT:    shrdl %cl, %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %esi, %eax
 ; X64-NEXT:    retq
   %bits32 = xor i32 %bits, 31
   %lo2 = add i32 %lo, %lo
diff --git a/llvm/test/CodeGen/X86/shift-pair.ll b/llvm/test/CodeGen/X86/shift-pair.ll
index 0a1d68d..d809f9f 100644
--- a/llvm/test/CodeGen/X86/shift-pair.ll
+++ b/llvm/test/CodeGen/X86/shift-pair.ll
@@ -4,9 +4,9 @@
 define i64 @test(i64 %A) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrq $54, %rdi
-; CHECK-NEXT:    andl $-4, %edi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shrq $54, %rax
+; CHECK-NEXT:    andl $-4, %eax
 ; CHECK-NEXT:    retq
     %B = lshr i64 %A, 56
     %C = shl i64 %B, 2
diff --git a/llvm/test/CodeGen/X86/shuffle-of-insert.ll b/llvm/test/CodeGen/X86/shuffle-of-insert.ll
index 16074dc..bf1d748 100644
--- a/llvm/test/CodeGen/X86/shuffle-of-insert.ll
+++ b/llvm/test/CodeGen/X86/shuffle-of-insert.ll
@@ -6,15 +6,15 @@
 define <4 x i32> @ins_elt_0(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: ins_elt_0:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: ins_elt_0:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $0, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $0, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_0:
@@ -36,8 +36,8 @@
 ;
 ; SSE4-LABEL: ins_elt_1:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $1, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $1, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_1:
@@ -54,16 +54,16 @@
 define <4 x i32> @ins_elt_2_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: ins_elt_2_commute:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: ins_elt_2_commute:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $2, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $2, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_2_commute:
@@ -78,16 +78,16 @@
 define <4 x i32> @ins_elt_3_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: ins_elt_3_commute:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: ins_elt_3_commute:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $3, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $3, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_3_commute:
@@ -104,16 +104,16 @@
 define <4 x i32> @ins_elt_0_to_2(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: ins_elt_0_to_2:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: ins_elt_0_to_2:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $2, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $2, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_0_to_2:
@@ -128,15 +128,15 @@
 define <4 x i32> @ins_elt_1_to_0(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: ins_elt_1_to_0:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: ins_elt_1_to_0:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $0, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $0, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_1_to_0:
@@ -151,16 +151,16 @@
 define <4 x i32> @ins_elt_2_to_3(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
 ; SSE2-LABEL: ins_elt_2_to_3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: ins_elt_2_to_3:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $3, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $3, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_2_to_3:
@@ -182,8 +182,8 @@
 ;
 ; SSE4-LABEL: ins_elt_3_to_1:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pinsrd $1, %edi, %xmm1
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pinsrd $1, %edi, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: ins_elt_3_to_1:
diff --git a/llvm/test/CodeGen/X86/signbit-shift.ll b/llvm/test/CodeGen/X86/signbit-shift.ll
index 1579a77..7c2ce7a 100644
--- a/llvm/test/CodeGen/X86/signbit-shift.ll
+++ b/llvm/test/CodeGen/X86/signbit-shift.ll
@@ -6,9 +6,9 @@
 define i32 @zext_ifpos(i32 %x) {
 ; CHECK-LABEL: zext_ifpos:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    notl %edi
-; CHECK-NEXT:    shrl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    retq
   %c = icmp sgt i32 %x, -1
   %e = zext i1 %c to i32
@@ -57,9 +57,9 @@
 define i32 @sext_ifpos(i32 %x) {
 ; CHECK-LABEL: sext_ifpos:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    notl %edi
-; CHECK-NEXT:    sarl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    sarl $31, %eax
 ; CHECK-NEXT:    retq
   %c = icmp sgt i32 %x, -1
   %e = sext i1 %c to i32
@@ -109,8 +109,8 @@
 define i32 @zext_ifneg(i32 %x) {
 ; CHECK-LABEL: zext_ifneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    shrl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    retq
   %c = icmp slt i32 %x, 0
   %r = zext i1 %c to i32
@@ -145,8 +145,8 @@
 define i32 @sext_ifneg(i32 %x) {
 ; CHECK-LABEL: sext_ifneg:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    sarl $31, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    sarl $31, %eax
 ; CHECK-NEXT:    retq
   %c = icmp slt i32 %x, 0
   %r = sext i1 %c to i32
diff --git a/llvm/test/CodeGen/X86/sret-implicit.ll b/llvm/test/CodeGen/X86/sret-implicit.ll
index 2a998fc..75aaf46 100644
--- a/llvm/test/CodeGen/X86/sret-implicit.ll
+++ b/llvm/test/CodeGen/X86/sret-implicit.ll
@@ -10,8 +10,8 @@
 }
 
 ; X64-LABEL: sret_void
-; X64-DAG: movl $0, (%rdi)
 ; X64-DAG: movq %rdi, %rax
+; X64-DAG: movl $0, (%rdi)
 ; X64: retq
 
 ; X86-LABEL: sret_void
@@ -24,8 +24,8 @@
 }
 
 ; X64-LABEL: sret_demoted
-; X64-DAG: movq $0, (%rdi)
 ; X64-DAG: movq %rdi, %rax
+; X64-DAG: movq $0, (%rdi)
 ; X64: retq
 
 ; X86-LABEL: sret_demoted
diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll
index 5e383a3..482062b 100644
--- a/llvm/test/CodeGen/X86/sse1.ll
+++ b/llvm/test/CodeGen/X86/sse1.ll
@@ -190,26 +190,27 @@
 ;
 ; X64-LABEL: PR30512:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    xorl %edi, %edi
 ; X64-NEXT:    cmpl {{[0-9]+}}(%rsp), %r8d
-; X64-NEXT:    sete %al
-; X64-NEXT:    negl %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    sete %dil
+; X64-NEXT:    negl %edi
+; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %edi, %edi
 ; X64-NEXT:    cmpl {{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    sete %al
-; X64-NEXT:    negl %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    sete %dil
+; X64-NEXT:    negl %edi
+; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl {{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    sete %al
-; X64-NEXT:    negl %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    sete %cl
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %ecx, %ecx
 ; X64-NEXT:    cmpl %r9d, %esi
-; X64-NEXT:    sete %al
-; X64-NEXT:    negl %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    sete %cl
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -218,8 +219,7 @@
 ; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; X64-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; X64-NEXT:    andps {{.*}}(%rip), %xmm2
-; X64-NEXT:    movaps %xmm2, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movaps %xmm2, (%rax)
 ; X64-NEXT:    retq
   %cmp = icmp eq <4 x i32> %x, %y
   %zext = zext <4 x i1> %cmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll
index f97bf08..dfeb3cf 100644
--- a/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse3-intrinsics-x86.ll
@@ -143,8 +143,8 @@
 ;
 ; X64-LABEL: monitor:
 ; X64:       ## %bb.0:
-; X64-NEXT:    leaq (%rdi), %rax ## encoding: [0x48,0x8d,0x07]
 ; X64-NEXT:    movl %esi, %ecx ## encoding: [0x89,0xf1]
+; X64-NEXT:    leaq (%rdi), %rax ## encoding: [0x48,0x8d,0x07]
 ; X64-NEXT:    monitor ## encoding: [0x0f,0x01,0xc8]
 ; X64-NEXT:    retq ## encoding: [0xc3]
   tail call void @llvm.x86.sse3.monitor(i8* %P, i32 %E, i32 %H)
@@ -162,8 +162,8 @@
 ;
 ; X64-LABEL: mwait:
 ; X64:       ## %bb.0:
-; X64-NEXT:    movl %edi, %ecx ## encoding: [0x89,0xf9]
 ; X64-NEXT:    movl %esi, %eax ## encoding: [0x89,0xf0]
+; X64-NEXT:    movl %edi, %ecx ## encoding: [0x89,0xf9]
 ; X64-NEXT:    mwait ## encoding: [0x0f,0x01,0xc9]
 ; X64-NEXT:    retq ## encoding: [0xc3]
   tail call void @llvm.x86.sse3.mwait(i32 %E, i32 %H)
diff --git a/llvm/test/CodeGen/X86/sse3-schedule.ll b/llvm/test/CodeGen/X86/sse3-schedule.ll
index d8b0c45..bb37f21 100644
--- a/llvm/test/CodeGen/X86/sse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse3-schedule.ll
@@ -768,120 +768,120 @@
 define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
 ; GENERIC-LABEL: test_monitor:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; GENERIC-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; GENERIC-NEXT:    monitor # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_monitor:
 ; ATOM:       # %bb.0:
-; ATOM-NEXT:    leaq (%rdi), %rax # sched: [1:1.00]
 ; ATOM-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; ATOM-NEXT:    leaq (%rdi), %rax # sched: [1:1.00]
 ; ATOM-NEXT:    monitor # sched: [45:22.50]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_monitor:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    leaq (%rdi), %rax # sched: [1:1.00]
 ; SLM-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; SLM-NEXT:    leaq (%rdi), %rax # sched: [1:1.00]
 ; SLM-NEXT:    monitor # sched: [100:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: test_monitor:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SANDY-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; SANDY-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SANDY-SSE-NEXT:    monitor # sched: [100:0.33]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_monitor:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SANDY-NEXT:    movl %esi, %ecx # sched: [1:0.33]
+; SANDY-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SANDY-NEXT:    monitor # sched: [100:0.33]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_monitor:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; HASWELL-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; HASWELL-SSE-NEXT:    monitor # sched: [100:0.25]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: test_monitor:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; HASWELL-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; HASWELL-NEXT:    monitor # sched: [100:0.25]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: test_monitor:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BROADWELL-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BROADWELL-SSE-NEXT:    monitor # sched: [100:0.25]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_monitor:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BROADWELL-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BROADWELL-NEXT:    monitor # sched: [100:0.25]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: test_monitor:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-SSE-NEXT:    monitor # sched: [100:0.25]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_monitor:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    monitor # sched: [100:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: test_monitor:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKX-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; SKX-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKX-SSE-NEXT:    monitor # sched: [100:0.25]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_monitor:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKX-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; SKX-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKX-NEXT:    monitor # sched: [100:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_monitor:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    monitor # sched: [100:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_monitor:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BTVER2-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    monitor # sched: [100:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_monitor:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    monitor # sched: [100:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: test_monitor:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
 ; ZNVER1-NEXT:    movl %esi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
 ; ZNVER1-NEXT:    monitor # sched: [100:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   tail call void @llvm.x86.sse3.monitor(i8* %a0, i32 %a1, i32 %a2)
@@ -1273,120 +1273,120 @@
 define void @test_mwait(i32 %a0, i32 %a1) {
 ; GENERIC-LABEL: test_mwait:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; GENERIC-NEXT:    movl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; GENERIC-NEXT:    mwait # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; ATOM-LABEL: test_mwait:
 ; ATOM:       # %bb.0:
-; ATOM-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; ATOM-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; ATOM-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; ATOM-NEXT:    mwait # sched: [46:23.00]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_mwait:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; SLM-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; SLM-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; SLM-NEXT:    mwait # sched: [100:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: test_mwait:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; SANDY-SSE-NEXT:    movl %esi, %eax # sched: [1:0.33]
+; SANDY-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; SANDY-SSE-NEXT:    mwait # sched: [100:0.33]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_mwait:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; SANDY-NEXT:    movl %esi, %eax # sched: [1:0.33]
+; SANDY-NEXT:    movl %edi, %ecx # sched: [1:0.33]
 ; SANDY-NEXT:    mwait # sched: [100:0.33]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_mwait:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; HASWELL-SSE-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; HASWELL-SSE-NEXT:    mwait # sched: [20:2.50]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: test_mwait:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; HASWELL-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; HASWELL-NEXT:    mwait # sched: [20:2.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: test_mwait:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; BROADWELL-SSE-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; BROADWELL-SSE-NEXT:    mwait # sched: [100:0.25]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_mwait:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; BROADWELL-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; BROADWELL-NEXT:    mwait # sched: [100:0.25]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: test_mwait:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKYLAKE-SSE-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKYLAKE-SSE-NEXT:    mwait # sched: [20:2.50]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_mwait:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKYLAKE-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKYLAKE-NEXT:    mwait # sched: [20:2.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: test_mwait:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKX-SSE-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; SKX-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKX-SSE-NEXT:    mwait # sched: [20:2.50]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_mwait:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKX-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; SKX-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKX-NEXT:    mwait # sched: [20:2.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: test_mwait:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    mwait # sched: [100:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_mwait:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; BTVER2-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; BTVER2-NEXT:    movl %edi, %ecx # sched: [1:0.50]
 ; BTVER2-NEXT:    mwait # sched: [100:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_mwait:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    mwait # sched: [100:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: test_mwait:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; ZNVER1-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; ZNVER1-NEXT:    mwait # sched: [100:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   tail call void @llvm.x86.sse3.mwait(i32 %a0, i32 %a1)
diff --git a/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll b/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
index 09a823e..852ebcd35 100644
--- a/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
+++ b/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
@@ -19,8 +19,8 @@
 define i64 @test_mm_crc64_u64(i64 %a0, i64 %a1) nounwind{
 ; CHECK-LABEL: test_mm_crc64_u64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    crc32q %rsi, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    crc32q %rsi, %rax
 ; CHECK-NEXT:    retq
   %res = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
   ret i64 %res
diff --git a/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
index 7f42f8e..68fc6d0 100644
--- a/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
@@ -35,22 +35,22 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestra:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %r8d, %r8d
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %esi
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X64-SSE-NEXT:    seta %r8b
-; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    seta %sil
+; X64-SSE-NEXT:    movl %esi, %eax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestra:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
+; X64-AVX-NEXT:    xorl %esi, %esi
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0
-; X64-AVX-NEXT:    seta %r8b
-; X64-AVX-NEXT:    movl %r8d, %eax
+; X64-AVX-NEXT:    seta %sil
+; X64-AVX-NEXT:    movl %esi, %eax
 ; X64-AVX-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -86,22 +86,22 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestrc:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %r8d, %r8d
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %esi
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X64-SSE-NEXT:    setb %r8b
-; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    setb %sil
+; X64-SSE-NEXT:    movl %esi, %eax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestrc:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
+; X64-AVX-NEXT:    xorl %esi, %esi
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0
-; X64-AVX-NEXT:    setb %r8b
-; X64-AVX-NEXT:    movl %r8d, %eax
+; X64-AVX-NEXT:    setb %sil
+; X64-AVX-NEXT:    movl %esi, %eax
 ; X64-AVX-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -129,16 +129,16 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestri:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0
 ; X64-SSE-NEXT:    movl %ecx, %eax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestri:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0
 ; X64-AVX-NEXT:    movl %ecx, %eax
 ; X64-AVX-NEXT:    retq
@@ -166,15 +166,15 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestrm:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    pcmpestrm $7, %xmm1, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestrm:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    vpcmpestrm $7, %xmm1, %xmm0
 ; X64-AVX-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -212,22 +212,22 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestro:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %r8d, %r8d
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %esi
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X64-SSE-NEXT:    seto %r8b
-; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    seto %sil
+; X64-SSE-NEXT:    movl %esi, %eax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestro:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
+; X64-AVX-NEXT:    xorl %esi, %esi
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0
-; X64-AVX-NEXT:    seto %r8b
-; X64-AVX-NEXT:    movl %r8d, %eax
+; X64-AVX-NEXT:    seto %sil
+; X64-AVX-NEXT:    movl %esi, %eax
 ; X64-AVX-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -263,22 +263,22 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestrs:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %r8d, %r8d
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %esi
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X64-SSE-NEXT:    sets %r8b
-; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    sets %sil
+; X64-SSE-NEXT:    movl %esi, %eax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestrs:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
+; X64-AVX-NEXT:    xorl %esi, %esi
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0
-; X64-AVX-NEXT:    sets %r8b
-; X64-AVX-NEXT:    movl %r8d, %eax
+; X64-AVX-NEXT:    sets %sil
+; X64-AVX-NEXT:    movl %esi, %eax
 ; X64-AVX-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -314,22 +314,22 @@
 ;
 ; X64-SSE-LABEL: test_mm_cmpestrz:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %r8d, %r8d
-; X64-SSE-NEXT:    movl %edi, %eax
 ; X64-SSE-NEXT:    movl %esi, %edx
+; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %esi
 ; X64-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X64-SSE-NEXT:    sete %r8b
-; X64-SSE-NEXT:    movl %r8d, %eax
+; X64-SSE-NEXT:    sete %sil
+; X64-SSE-NEXT:    movl %esi, %eax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_mm_cmpestrz:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    movl %edi, %eax
 ; X64-AVX-NEXT:    movl %esi, %edx
+; X64-AVX-NEXT:    movl %edi, %eax
+; X64-AVX-NEXT:    xorl %esi, %esi
 ; X64-AVX-NEXT:    vpcmpestri $7, %xmm1, %xmm0
-; X64-AVX-NEXT:    sete %r8b
-; X64-AVX-NEXT:    movl %r8d, %eax
+; X64-AVX-NEXT:    sete %sil
+; X64-AVX-NEXT:    movl %esi, %eax
 ; X64-AVX-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -510,8 +510,8 @@
 ;
 ; X64-LABEL: test_mm_crc32_u8:
 ; X64:       # %bb.0:
-; X64-NEXT:    crc32b %sil, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    crc32b %sil, %eax
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
   ret i32 %res
@@ -527,8 +527,8 @@
 ;
 ; X64-LABEL: test_mm_crc32_u16:
 ; X64:       # %bb.0:
-; X64-NEXT:    crc32w %si, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    crc32w %si, %eax
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
   ret i32 %res
@@ -544,8 +544,8 @@
 ;
 ; X64-LABEL: test_mm_crc32_u32:
 ; X64:       # %bb.0:
-; X64-NEXT:    crc32l %esi, %edi
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    crc32l %esi, %eax
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
   ret i32 %res
diff --git a/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll
index 6f1d653..120e3c0 100644
--- a/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -626,8 +626,8 @@
 ;
 ; X64-LABEL: crc32_32_8:
 ; X64:       ## %bb.0:
-; X64-NEXT:    crc32b %sil, %edi ## encoding: [0xf2,0x40,0x0f,0x38,0xf0,0xfe]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    crc32b %sil, %eax ## encoding: [0xf2,0x40,0x0f,0x38,0xf0,0xc6]
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
   ret i32 %tmp
@@ -643,8 +643,8 @@
 ;
 ; X64-LABEL: crc32_32_16:
 ; X64:       ## %bb.0:
-; X64-NEXT:    crc32w %si, %edi ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xfe]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
   ret i32 %tmp
@@ -660,8 +660,8 @@
 ;
 ; X64-LABEL: crc32_32_32:
 ; X64:       ## %bb.0:
-; X64-NEXT:    crc32l %esi, %edi ## encoding: [0xf2,0x0f,0x38,0xf1,0xfe]
 ; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    crc32l %esi, %eax ## encoding: [0xf2,0x0f,0x38,0xf1,0xc6]
 ; X64-NEXT:    retq ## encoding: [0xc3]
   %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
   ret i32 %tmp
diff --git a/llvm/test/CodeGen/X86/sse42-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
index f23dd36..2f069d0 100644
--- a/llvm/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
+++ b/llvm/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
@@ -9,8 +9,8 @@
 define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
 ; CHECK-LABEL: crc32_64_8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    crc32b %sil, %edi ## encoding: [0xf2,0x40,0x0f,0x38,0xf0,0xfe]
 ; CHECK-NEXT:    movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    crc32b %sil, %eax ## encoding: [0xf2,0x40,0x0f,0x38,0xf0,0xc6]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %tmp = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a, i8 %b)
   ret i64 %tmp
@@ -19,8 +19,8 @@
 define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: crc32_64_64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    crc32q %rsi, %rdi ## encoding: [0xf2,0x48,0x0f,0x38,0xf1,0xfe]
 ; CHECK-NEXT:    movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    crc32q %rsi, %rax ## encoding: [0xf2,0x48,0x0f,0x38,0xf1,0xc6]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %tmp = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a, i64 %b)
   ret i64 %tmp
diff --git a/llvm/test/CodeGen/X86/sse42-schedule.ll b/llvm/test/CodeGen/X86/sse42-schedule.ll
index 1e91079..7bb4ac6 100644
--- a/llvm/test/CodeGen/X86/sse42-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse42-schedule.ll
@@ -21,114 +21,114 @@
 define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ; GENERIC-LABEL: crc32_32_8:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; GENERIC-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; GENERIC-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: crc32_32_8:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SLM-NEXT:    crc32b (%rdx), %edi # sched: [6:1.00]
 ; SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SLM-NEXT:    crc32b (%rdx), %eax # sched: [6:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: crc32_32_8:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SANDY-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: crc32_32_8:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SANDY-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: crc32_32_8:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; HASWELL-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: crc32_32_8:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; HASWELL-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: crc32_32_8:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BROADWELL-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: crc32_32_8:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; BROADWELL-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BROADWELL-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: crc32_32_8:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKYLAKE-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKYLAKE-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: crc32_32_8:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKYLAKE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: crc32_32_8:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKX-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKX-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKX-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: crc32_32_8:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKX-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKX-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_32_8:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    crc32b %sil, %edi # sched: [3:2.00]
-; BTVER2-SSE-NEXT:    crc32b (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BTVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_8:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    crc32b %sil, %edi # sched: [3:2.00]
-; BTVER2-NEXT:    crc32b (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BTVER2-NEXT:    crc32b (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: crc32_32_8:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    crc32b (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; ZNVER1-SSE-NEXT:    crc32b (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: crc32_32_8:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; ZNVER1-NEXT:    crc32b (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; ZNVER1-NEXT:    crc32b (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
   %2 = load i8, i8 *%a2
@@ -140,114 +140,114 @@
 define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
 ; GENERIC-LABEL: crc32_32_16:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; GENERIC-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; GENERIC-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: crc32_32_16:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SLM-NEXT:    crc32w (%rdx), %edi # sched: [6:1.00]
 ; SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SLM-NEXT:    crc32w (%rdx), %eax # sched: [6:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: crc32_32_16:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SANDY-SSE-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SANDY-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: crc32_32_16:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SANDY-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SANDY-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: crc32_32_16:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; HASWELL-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: crc32_32_16:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; HASWELL-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: crc32_32_16:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; BROADWELL-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: crc32_32_16:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; BROADWELL-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; BROADWELL-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: crc32_32_16:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SKYLAKE-SSE-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SKYLAKE-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: crc32_32_16:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SKYLAKE-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: crc32_32_16:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SKX-SSE-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SKX-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SKX-SSE-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: crc32_32_16:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; SKX-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; SKX-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_32_16:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    crc32w %si, %edi # sched: [3:2.00]
-; BTVER2-SSE-NEXT:    crc32w (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    crc32w %si, %eax # sched: [3:2.00]
+; BTVER2-SSE-NEXT:    crc32w (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_16:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    crc32w %si, %edi # sched: [3:2.00]
-; BTVER2-NEXT:    crc32w (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT:    crc32w %si, %eax # sched: [3:2.00]
+; BTVER2-NEXT:    crc32w (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: crc32_32_16:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    crc32w (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; ZNVER1-SSE-NEXT:    crc32w (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: crc32_32_16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    crc32w %si, %edi # sched: [3:1.00]
-; ZNVER1-NEXT:    crc32w (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT:    crc32w %si, %eax # sched: [3:1.00]
+; ZNVER1-NEXT:    crc32w (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
   %2 = load i16, i16 *%a2
@@ -259,114 +259,114 @@
 define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
 ; GENERIC-LABEL: crc32_32_32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; GENERIC-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; GENERIC-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; GENERIC-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: crc32_32_32:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SLM-NEXT:    crc32l (%rdx), %edi # sched: [6:1.00]
 ; SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SLM-NEXT:    crc32l (%rdx), %eax # sched: [6:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: crc32_32_32:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SANDY-SSE-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SANDY-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: crc32_32_32:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SANDY-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT:    movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SANDY-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: crc32_32_32:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; HASWELL-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: crc32_32_32:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; HASWELL-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: crc32_32_32:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; BROADWELL-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: crc32_32_32:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; BROADWELL-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: crc32_32_32:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SKYLAKE-SSE-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SKYLAKE-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: crc32_32_32:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SKYLAKE-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: crc32_32_32:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SKX-SSE-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SKX-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SKX-SSE-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: crc32_32_32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; SKX-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; SKX-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_32_32:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    crc32l %esi, %edi # sched: [3:2.00]
-; BTVER2-SSE-NEXT:    crc32l (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    crc32l %esi, %eax # sched: [3:2.00]
+; BTVER2-SSE-NEXT:    crc32l (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_32:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    crc32l %esi, %edi # sched: [3:2.00]
-; BTVER2-NEXT:    crc32l (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT:    crc32l %esi, %eax # sched: [3:2.00]
+; BTVER2-NEXT:    crc32l (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: crc32_32_32:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    crc32l (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; ZNVER1-SSE-NEXT:    crc32l (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: crc32_32_32:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
-; ZNVER1-NEXT:    crc32l (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT:    crc32l %esi, %eax # sched: [3:1.00]
+; ZNVER1-NEXT:    crc32l (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
   %2 = load i32, i32 *%a2
@@ -378,114 +378,114 @@
 define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
 ; GENERIC-LABEL: crc32_64_8:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; GENERIC-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; GENERIC-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: crc32_64_8:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SLM-NEXT:    crc32b (%rdx), %edi # sched: [6:1.00]
 ; SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SLM-NEXT:    crc32b (%rdx), %eax # sched: [6:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: crc32_64_8:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; SANDY-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SANDY-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: crc32_64_8:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SANDY-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SANDY-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: crc32_64_8:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; HASWELL-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: crc32_64_8:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; HASWELL-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; HASWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; HASWELL-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: crc32_64_8:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BROADWELL-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: crc32_64_8:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; BROADWELL-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; BROADWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; BROADWELL-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: crc32_64_8:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKYLAKE-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKYLAKE-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: crc32_64_8:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKYLAKE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKYLAKE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: crc32_64_8:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKX-SSE-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKX-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKX-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKX-SSE-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: crc32_64_8:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; SKX-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; SKX-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_64_8:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    crc32b %sil, %edi # sched: [3:2.00]
-; BTVER2-SSE-NEXT:    crc32b (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BTVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: crc32_64_8:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    crc32b %sil, %edi # sched: [3:2.00]
-; BTVER2-NEXT:    crc32b (%rdx), %edi # sched: [6:2.00]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BTVER2-NEXT:    crc32b (%rdx), %eax # sched: [6:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: crc32_64_8:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    crc32b (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; ZNVER1-SSE-NEXT:    crc32b (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: crc32_64_8:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
-; ZNVER1-NEXT:    crc32b (%rdx), %edi # sched: [10:1.00]
 ; ZNVER1-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT:    crc32b %sil, %eax # sched: [3:1.00]
+; ZNVER1-NEXT:    crc32b (%rdx), %eax # sched: [10:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1)
   %2 = load i8, i8 *%a2
@@ -497,114 +497,114 @@
 define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
 ; GENERIC-LABEL: crc32_64_64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; GENERIC-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; GENERIC-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; GENERIC-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: crc32_64_64:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SLM-NEXT:    crc32q (%rdx), %rdi # sched: [6:1.00]
 ; SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SLM-NEXT:    crc32q (%rdx), %rax # sched: [6:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: crc32_64_64:
 ; SANDY-SSE:       # %bb.0:
-; SANDY-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SANDY-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; SANDY-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SANDY-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: crc32_64_64:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SANDY-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SANDY-NEXT:    movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SANDY-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: crc32_64_64:
 ; HASWELL-SSE:       # %bb.0:
-; HASWELL-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; HASWELL-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: crc32_64_64:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; HASWELL-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; HASWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; HASWELL-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: crc32_64_64:
 ; BROADWELL-SSE:       # %bb.0:
-; BROADWELL-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; BROADWELL-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: crc32_64_64:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; BROADWELL-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; BROADWELL-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: crc32_64_64:
 ; SKYLAKE-SSE:       # %bb.0:
-; SKYLAKE-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SKYLAKE-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SKYLAKE-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: crc32_64_64:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SKYLAKE-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SKYLAKE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: crc32_64_64:
 ; SKX-SSE:       # %bb.0:
-; SKX-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SKX-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SKX-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKX-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SKX-SSE-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: crc32_64_64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; SKX-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SKX-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; SKX-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-SSE-LABEL: crc32_64_64:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:2.00]
-; BTVER2-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:2.00]
+; BTVER2-SSE-NEXT:    crc32q (%rdx), %rax # sched: [6:2.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: crc32_64_64:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    crc32q %rsi, %rdi # sched: [3:2.00]
-; BTVER2-NEXT:    crc32q (%rdx), %rdi # sched: [6:2.00]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    crc32q %rsi, %rax # sched: [3:2.00]
+; BTVER2-NEXT:    crc32q (%rdx), %rax # sched: [6:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: crc32_64_64:
 ; ZNVER1-SSE:       # %bb.0:
-; ZNVER1-SSE-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    crc32q (%rdx), %rdi # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-SSE-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; ZNVER1-SSE-NEXT:    crc32q (%rdx), %rax # sched: [10:1.00]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: crc32_64_64:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
-; ZNVER1-NEXT:    crc32q (%rdx), %rdi # sched: [10:1.00]
 ; ZNVER1-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT:    crc32q %rsi, %rax # sched: [3:1.00]
+; ZNVER1-NEXT:    crc32q (%rdx), %rax # sched: [10:1.00]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
   %2 = load i64, i64 *%a2
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 70cccaa..021fb2c 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -20,8 +20,8 @@
 ;
 ; X64-LABEL: pcmpestri_reg_eq_i8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
@@ -42,8 +42,8 @@
 ;
 ; X64-LABEL: pcmpestri_reg_idx_i8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    retq
@@ -81,8 +81,8 @@
 ;
 ; X64-LABEL: pcmpestri_reg_diff_i8:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    cmpl $16, %ecx
@@ -133,8 +133,8 @@
 ; X64-LABEL: pcmpestri_mem_eq_i8:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
 ; X64-NEXT:    setae %al
@@ -166,8 +166,8 @@
 ; X64-LABEL: pcmpestri_mem_idx_i8:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
 ; X64-NEXT:    movl %ecx, %eax
@@ -216,9 +216,9 @@
 ;
 ; X64-LABEL: pcmpestri_mem_diff_i8:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movdqu (%rdi), %xmm1
 ; X64-NEXT:    movdqu (%rdx), %xmm0
-; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
@@ -268,8 +268,8 @@
 ;
 ; X64-LABEL: pcmpestri_reg_eq_i16:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
@@ -292,8 +292,8 @@
 ;
 ; X64-LABEL: pcmpestri_reg_idx_i16:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    retq
@@ -334,8 +334,8 @@
 ;
 ; X64-LABEL: pcmpestri_reg_diff_i16:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; X64-NEXT:    cmpl $16, %ecx
@@ -388,8 +388,8 @@
 ; X64-LABEL: pcmpestri_mem_eq_i16:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
 ; X64-NEXT:    setae %al
@@ -423,8 +423,8 @@
 ; X64-LABEL: pcmpestri_mem_idx_i16:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movdqu (%rdi), %xmm0
 ; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
 ; X64-NEXT:    movl %ecx, %eax
@@ -476,9 +476,9 @@
 ;
 ; X64-LABEL: pcmpestri_mem_diff_i16:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movdqu (%rdi), %xmm1
 ; X64-NEXT:    movdqu (%rdx), %xmm0
-; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    movl %ecx, %edx
 ; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
 ; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
@@ -989,13 +989,13 @@
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rcx, %r8
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    xorl %r10d, %r10d
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    setb %sil
 ; X64-NEXT:    movl %ecx, (%r9)
-; X64-NEXT:    movl %r10d, (%r8)
+; X64-NEXT:    movl %esi, (%r8)
 ; X64-NEXT:    retq
 entry:
   %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
@@ -1026,13 +1026,13 @@
 ; X64-LABEL: pcmpestr_mask_flag:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    xorl %r9d, %r9d
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
-; X64-NEXT:    setb %r9b
+; X64-NEXT:    setb %sil
 ; X64-NEXT:    movdqa %xmm0, (%r8)
-; X64-NEXT:    movl %r9d, (%rcx)
+; X64-NEXT:    movl %esi, (%rcx)
 ; X64-NEXT:    retq
 entry:
   %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
@@ -1064,9 +1064,9 @@
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rcx, %r8
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movdqa %xmm0, %xmm2
 ; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
 ; X64-NEXT:    movdqa %xmm0, (%r9)
@@ -1110,9 +1110,9 @@
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rcx, %r9
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movdqa %xmm0, %xmm2
 ; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
@@ -1321,9 +1321,9 @@
 ;
 ; X64-LABEL: pcmpestri_nontemporal:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movntdqa (%rsi), %xmm1
 ; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
 ; X64-NEXT:    setb %sil
 ; X64-NEXT:    movl %esi, %eax
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 932616c..4b796da 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -4,9 +4,9 @@
 define i128 @sub128(i128 %a, i128 %b) nounwind {
 ; CHECK-LABEL: sub128:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    subq %rdx, %rdi
-; CHECK-NEXT:    sbbq %rcx, %rsi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rdx, %rax
+; CHECK-NEXT:    sbbq %rcx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rdx
 ; CHECK-NEXT:    retq
 entry:
@@ -17,6 +17,7 @@
 define i256 @sub256(i256 %a, i256 %b) nounwind {
 ; CHECK-LABEL: sub256:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    subq %r9, %rsi
 ; CHECK-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
 ; CHECK-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
@@ -25,7 +26,6 @@
 ; CHECK-NEXT:    movq %rsi, (%rdi)
 ; CHECK-NEXT:    movq %rcx, 16(%rdi)
 ; CHECK-NEXT:    movq %r8, 24(%rdi)
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = sub i256 %a, %b
@@ -37,19 +37,19 @@
 define %S @negate(%S* nocapture readonly %this) {
 ; CHECK-LABEL: negate:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %r8d, %r8d
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    subq (%rsi), %rcx
-; CHECK-NEXT:    movl $0, %edx
-; CHECK-NEXT:    sbbq 8(%rsi), %rdx
-; CHECK-NEXT:    movl $0, %eax
-; CHECK-NEXT:    sbbq 16(%rsi), %rax
-; CHECK-NEXT:    sbbq 24(%rsi), %r8
-; CHECK-NEXT:    movq %rcx, (%rdi)
-; CHECK-NEXT:    movq %rdx, 8(%rdi)
-; CHECK-NEXT:    movq %rax, 16(%rdi)
-; CHECK-NEXT:    movq %r8, 24(%rdi)
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorl %r8d, %r8d
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    subq (%rsi), %rdx
+; CHECK-NEXT:    movl $0, %edi
+; CHECK-NEXT:    sbbq 8(%rsi), %rdi
+; CHECK-NEXT:    movl $0, %ecx
+; CHECK-NEXT:    sbbq 16(%rsi), %rcx
+; CHECK-NEXT:    sbbq 24(%rsi), %r8
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %rdi, 8(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
+; CHECK-NEXT:    movq %r8, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 0
@@ -90,29 +90,29 @@
 define %S @sub(%S* nocapture readonly %this, %S %arg.b) local_unnamed_addr {
 ; CHECK-LABEL: sub:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    notq %rdx
-; CHECK-NEXT:    xorl %r10d, %r10d
-; CHECK-NEXT:    addq (%rsi), %rdx
-; CHECK-NEXT:    setb %r10b
-; CHECK-NEXT:    addq $1, %rdx
-; CHECK-NEXT:    adcq 8(%rsi), %r10
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movzbl %al, %r11d
-; CHECK-NEXT:    notq %rcx
-; CHECK-NEXT:    addq %r10, %rcx
-; CHECK-NEXT:    adcq 16(%rsi), %r11
-; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movzbl %al, %eax
-; CHECK-NEXT:    notq %r8
-; CHECK-NEXT:    addq %r11, %r8
-; CHECK-NEXT:    adcq 24(%rsi), %rax
-; CHECK-NEXT:    notq %r9
-; CHECK-NEXT:    addq %rax, %r9
-; CHECK-NEXT:    movq %rdx, (%rdi)
-; CHECK-NEXT:    movq %rcx, 8(%rdi)
-; CHECK-NEXT:    movq %r8, 16(%rdi)
-; CHECK-NEXT:    movq %r9, 24(%rdi)
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    notq %rdx
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    addq (%rsi), %rdx
+; CHECK-NEXT:    setb %dil
+; CHECK-NEXT:    addq $1, %rdx
+; CHECK-NEXT:    adcq 8(%rsi), %rdi
+; CHECK-NEXT:    setb %r10b
+; CHECK-NEXT:    movzbl %r10b, %r10d
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    addq %rdi, %rcx
+; CHECK-NEXT:    adcq 16(%rsi), %r10
+; CHECK-NEXT:    setb %dil
+; CHECK-NEXT:    movzbl %dil, %edi
+; CHECK-NEXT:    notq %r8
+; CHECK-NEXT:    addq %r10, %r8
+; CHECK-NEXT:    adcq 24(%rsi), %rdi
+; CHECK-NEXT:    notq %r9
+; CHECK-NEXT:    addq %rdi, %r9
+; CHECK-NEXT:    movq %rdx, (%rax)
+; CHECK-NEXT:    movq %rcx, 8(%rax)
+; CHECK-NEXT:    movq %r8, 16(%rax)
+; CHECK-NEXT:    movq %r9, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = extractvalue %S %arg.b, 0
diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll
index cb85585..77591d3 100644
--- a/llvm/test/CodeGen/X86/swift-return.ll
+++ b/llvm/test/CodeGen/X86/swift-return.ll
@@ -457,18 +457,18 @@
 ; CHECK-LABEL: gen9:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    movl %edi, %r8d
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    movl %eax, %r8d
 ; CHECK-NEXT:    retq
 ;
 ; CHECK-O0-LABEL: gen9:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movb %dil, %al
-; CHECK-O0-NEXT:    movb %al, -{{[0-9]+}}(%rsp) # 1-byte Spill
-; CHECK-O0-NEXT:    movb -{{[0-9]+}}(%rsp), %dl # 1-byte Reload
-; CHECK-O0-NEXT:    movb -{{[0-9]+}}(%rsp), %cl # 1-byte Reload
-; CHECK-O0-NEXT:    movb -{{[0-9]+}}(%rsp), %r8b # 1-byte Reload
+; CHECK-O0-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %dl # 1-byte Reload
+; CHECK-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-O0-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %r8b # 1-byte Reload
 ; CHECK-O0-NEXT:    retq
   %v0 = insertvalue { i8, i8, i8, i8 } undef, i8 %key, 0
   %v1 = insertvalue { i8, i8, i8, i8 } %v0, i8 %key, 1
@@ -479,10 +479,10 @@
 define swiftcc { double, double, double, double, i64, i64, i64, i64 } @gen10(double %keyd, i64 %keyi) {
 ; CHECK-LABEL: gen10:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm0, %xmm2
 ; CHECK-NEXT:    movaps %xmm0, %xmm3
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movq %rdi, %rdx
 ; CHECK-NEXT:    movq %rdi, %rcx
 ; CHECK-NEXT:    movq %rdi, %r8
@@ -490,12 +490,12 @@
 ;
 ; CHECK-O0-LABEL: gen10:
 ; CHECK-O0:       # %bb.0:
-; CHECK-O0-NEXT:    movsd %xmm0, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK-O0-NEXT:    movsd -{{[0-9]+}}(%rsp), %xmm1 # 8-byte Reload
+; CHECK-O0-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-O0-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
 ; CHECK-O0-NEXT:    # xmm1 = mem[0],zero
-; CHECK-O0-NEXT:    movsd -{{[0-9]+}}(%rsp), %xmm2 # 8-byte Reload
+; CHECK-O0-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 8-byte Reload
 ; CHECK-O0-NEXT:    # xmm2 = mem[0],zero
-; CHECK-O0-NEXT:    movsd -{{[0-9]+}}(%rsp), %xmm3 # 8-byte Reload
+; CHECK-O0-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 8-byte Reload
 ; CHECK-O0-NEXT:    # xmm3 = mem[0],zero
 ; CHECK-O0-NEXT:    movq %rdi, %rax
 ; CHECK-O0-NEXT:    movq %rdi, %rdx
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 1ecd337..cb0597f 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-APPLE %s
-; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-O0 %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin | FileCheck --check-prefix=CHECK-i386 %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-APPLE %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin -O0 | FileCheck --check-prefix=CHECK-O0 %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i386-apple-darwin | FileCheck --check-prefix=CHECK-i386 %s
 
 declare i8* @malloc(i64)
 declare void @free(i8*)
@@ -37,8 +37,7 @@
 ; CHECK-APPLE: testq %r12, %r12
 ; CHECK-APPLE: jne
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: movb 8(%r12)
-; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: movb 8(%rdi)
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller:
@@ -250,9 +249,8 @@
 ; CHECK-APPLE: testq %r12, %r12
 ; CHECK-APPLE: jne
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: movb 8(%r12),
+; CHECK-APPLE: movb 8(%rdi),
 ; CHECK-APPLE: movb %{{.*}},
-; CHECK-APPLE: movq %r12, %rdi
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller3:
@@ -300,8 +298,7 @@
 ; CHECK-APPLE: testq %r12, %r12
 ; CHECK-APPLE: jne
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: movb 8(%r12)
-; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: movb 8(%rdi)
 ; CHECK-APPLE: callq {{.*}}free
 
 ; The second swifterror value:
@@ -310,8 +307,7 @@
 ; CHECK-APPLE: testq %r12, %r12
 ; CHECK-APPLE: jne
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: movb 8(%r12)
-; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: movb 8(%rdi)
 ; CHECK-APPLE: callq {{.*}}free
 
 ; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
@@ -488,8 +484,8 @@
 ; CHECK-i386:  retl
 ; CHECK-APPLE-LABEL: empty_swiftcc:
 ; CHECK-APPLE:  movl    %edx, %ecx
-; CHECK-APPLE:  movl    %edi, %eax
-; CHECK-APPLE:  movl    %esi, %edx
+; CHECK-APPLE-DAG:  movl    %edi, %eax
+; CHECK-APPLE-DAG:  movl    %esi, %edx
 ; CHECK-APPLE:  retq
 define swiftcc {i32, i32, i32} @empty_swiftcc({i32, i32, i32} , %swift_error** swifterror %error_ptr_ref) {
 entry:
diff --git a/llvm/test/CodeGen/X86/system-intrinsics-xsetbv.ll b/llvm/test/CodeGen/X86/system-intrinsics-xsetbv.ll
index 6dfb563..a0961a0 100644
--- a/llvm/test/CodeGen/X86/system-intrinsics-xsetbv.ll
+++ b/llvm/test/CodeGen/X86/system-intrinsics-xsetbv.ll
@@ -11,8 +11,8 @@
 
 ; CHECK64-LABEL: test_xsetbv
 ; CHECK64: movl  %edx, %eax
-; CHECK64: movl  %edi, %ecx
-; CHECK64: movl  %esi, %edx
+; CHECK64-DAG: movl  %edi, %ecx
+; CHECK64-DAG: movl  %esi, %edx
 ; CHECK64: xsetbv
 ; CHECK64: ret
 
diff --git a/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll b/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
index 8c2f578..82b3b4c 100644
--- a/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
+++ b/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
@@ -40,10 +40,10 @@
 ; X64-LABEL: test__blcic_u64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    xorq $-1, %rax
-; X64-NEXT:    addq $1, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    xorq $-1, %rcx
+; X64-NEXT:    addq $1, %rax
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = add i64 %a0, 1
@@ -89,10 +89,10 @@
 ; X64-LABEL: test__blsic_u64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    xorq $-1, %rax
-; X64-NEXT:    subq $1, %rdi
-; X64-NEXT:    orq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    xorq $-1, %rcx
+; X64-NEXT:    subq $1, %rax
+; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = sub i64 %a0, 1
@@ -104,10 +104,10 @@
 ; X64-LABEL: test__t1mskc_u64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    xorq $-1, %rax
-; X64-NEXT:    addq $1, %rdi
-; X64-NEXT:    orq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    xorq $-1, %rcx
+; X64-NEXT:    addq $1, %rax
+; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = add i64 %a0, 1
@@ -119,10 +119,10 @@
 ; X64-LABEL: test__tzmsk_u64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    xorq $-1, %rax
-; X64-NEXT:    subq $1, %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    xorq $-1, %rcx
+; X64-NEXT:    subq $1, %rax
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = sub i64 %a0, 1
diff --git a/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
index d550cb7..0664d04 100644
--- a/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
@@ -72,10 +72,10 @@
 ; X64-LABEL: test__blcic_u32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl $-1, %eax
-; X64-NEXT:    addl $1, %edi
-; X64-NEXT:    andl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    xorl $-1, %ecx
+; X64-NEXT:    addl $1, %eax
+; X64-NEXT:    andl %ecx, %eax
 ; X64-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = add i32 %a0, 1
@@ -154,10 +154,10 @@
 ; X64-LABEL: test__blsic_u32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl $-1, %eax
-; X64-NEXT:    subl $1, %edi
-; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    xorl $-1, %ecx
+; X64-NEXT:    subl $1, %eax
+; X64-NEXT:    orl %ecx, %eax
 ; X64-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = sub i32 %a0, 1
@@ -178,10 +178,10 @@
 ; X64-LABEL: test__t1mskc_u32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl $-1, %eax
-; X64-NEXT:    addl $1, %edi
-; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    xorl $-1, %ecx
+; X64-NEXT:    addl $1, %eax
+; X64-NEXT:    orl %ecx, %eax
 ; X64-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = add i32 %a0, 1
@@ -202,10 +202,10 @@
 ; X64-LABEL: test__tzmsk_u32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl $-1, %eax
-; X64-NEXT:    subl $1, %edi
-; X64-NEXT:    andl %eax, %edi
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    xorl $-1, %ecx
+; X64-NEXT:    subl $1, %eax
+; X64-NEXT:    andl %ecx, %eax
 ; X64-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = sub i32 %a0, 1
diff --git a/llvm/test/CodeGen/X86/tbm_patterns.ll b/llvm/test/CodeGen/X86/tbm_patterns.ll
index 0b63a0e..6865cc5 100644
--- a/llvm/test/CodeGen/X86/tbm_patterns.ll
+++ b/llvm/test/CodeGen/X86/tbm_patterns.ll
@@ -52,10 +52,10 @@
 define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    shrl $4, %edi
 ; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
-; CHECK-NEXT:    cmovnel %edx, %esi
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = lshr i32 %a, 4
   %t1 = and i32 %t0, 4095
@@ -113,10 +113,10 @@
 define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    shrl $4, %edi
 ; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
-; CHECK-NEXT:    cmovneq %rdx, %rsi
-; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = lshr i64 %a, 4
   %t1 = and i64 %t0, 4095
@@ -151,11 +151,11 @@
 define i32 @test_x86_tbm_blcfill_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcfill_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal 1(%rdi), %eax
-; CHECK-NEXT:    testl %edi, %eax
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 1(%rdi), %ecx
+; CHECK-NEXT:    testl %edi, %ecx
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, 1
   %t1 = and i32 %t0, %a
@@ -190,10 +190,10 @@
 define i64 @test_x86_tbm_blcfill_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcfill_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 1(%rdi), %rax
-; CHECK-NEXT:    testq %rdi, %rax
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    leaq 1(%rdi), %rcx
+; CHECK-NEXT:    testq %rdi, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, 1
   %t1 = and i64 %t0, %a
@@ -230,12 +230,12 @@
 define i32 @test_x86_tbm_blci_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blci_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal 1(%rdi), %eax
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    orl %edi, %eax
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 1(%rdi), %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    orl %edi, %ecx
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 1, %a
   %t1 = xor i32 %t0, -1
@@ -273,11 +273,11 @@
 define i64 @test_x86_tbm_blci_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blci_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 1(%rdi), %rax
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    orq %rdi, %rax
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    leaq 1(%rdi), %rcx
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    orq %rdi, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 1, %a
   %t1 = xor i64 %t0, -1
@@ -335,12 +335,12 @@
 define i32 @test_x86_tbm_blcic_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcic_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    testl %eax, %edi
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    incl %edi
+; CHECK-NEXT:    testl %ecx, %edi
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
   %t1 = add i32 %a, 1
@@ -378,12 +378,12 @@
 define i64 @test_x86_tbm_blcic_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcic_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    incq %rdi
-; CHECK-NEXT:    testq %rax, %rdi
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    testq %rcx, %rdi
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
   %t1 = add i64 %a, 1
@@ -419,11 +419,11 @@
 define i32 @test_x86_tbm_blcmsk_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcmsk_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal 1(%rdi), %eax
-; CHECK-NEXT:    xorl %edi, %eax
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 1(%rdi), %ecx
+; CHECK-NEXT:    xorl %edi, %ecx
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, 1
   %t1 = xor i32 %t0, %a
@@ -458,10 +458,10 @@
 define i64 @test_x86_tbm_blcmsk_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcmsk_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 1(%rdi), %rax
-; CHECK-NEXT:    xorq %rdi, %rax
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    leaq 1(%rdi), %rcx
+; CHECK-NEXT:    xorq %rdi, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, 1
   %t1 = xor i64 %t0, %a
@@ -496,11 +496,11 @@
 define i32 @test_x86_tbm_blcs_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcs_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal 1(%rdi), %eax
-; CHECK-NEXT:    orl %edi, %eax
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal 1(%rdi), %ecx
+; CHECK-NEXT:    orl %edi, %ecx
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, 1
   %t1 = or i32 %t0, %a
@@ -535,10 +535,10 @@
 define i64 @test_x86_tbm_blcs_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blcs_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq 1(%rdi), %rax
-; CHECK-NEXT:    orq %rdi, %rax
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    leaq 1(%rdi), %rcx
+; CHECK-NEXT:    orq %rdi, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, 1
   %t1 = or i64 %t0, %a
@@ -573,11 +573,11 @@
 define i32 @test_x86_tbm_blsfill_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blsfill_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal -1(%rdi), %eax
-; CHECK-NEXT:    orl %edi, %eax
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
+; CHECK-NEXT:    leal -1(%rdi), %ecx
+; CHECK-NEXT:    orl %edi, %ecx
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = add i32 %a, -1
   %t1 = or i32 %t0, %a
@@ -612,10 +612,10 @@
 define i64 @test_x86_tbm_blsfill_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blsfill_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    leaq -1(%rdi), %rax
-; CHECK-NEXT:    orq %rdi, %rax
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    leaq -1(%rdi), %rcx
+; CHECK-NEXT:    orq %rdi, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = add i64 %a, -1
   %t1 = or i64 %t0, %a
@@ -652,12 +652,12 @@
 define i32 @test_x86_tbm_blsic_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blsic_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    decl %edi
-; CHECK-NEXT:    orl %eax, %edi
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    decl %edi
+; CHECK-NEXT:    orl %ecx, %edi
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
   %t1 = add i32 %a, -1
@@ -695,12 +695,12 @@
 define i64 @test_x86_tbm_blsic_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_blsic_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    decq %rdi
-; CHECK-NEXT:    orq %rax, %rdi
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    decq %rdi
+; CHECK-NEXT:    orq %rcx, %rdi
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
   %t1 = add i64 %a, -1
@@ -739,12 +739,12 @@
 define i32 @test_x86_tbm_t1mskc_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_t1mskc_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    incl %edi
-; CHECK-NEXT:    orl %eax, %edi
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    incl %edi
+; CHECK-NEXT:    orl %ecx, %edi
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
   %t1 = add i32 %a, 1
@@ -783,12 +783,12 @@
 define i64 @test_x86_tbm_t1mskc_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_t1mskc_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    incq %rdi
-; CHECK-NEXT:    orq %rax, %rdi
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    orq %rcx, %rdi
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
   %t1 = add i64 %a, 1
@@ -827,12 +827,12 @@
 define i32 @test_x86_tbm_tzmsk_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_tzmsk_u32_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    decl %edi
-; CHECK-NEXT:    testl %edi, %eax
-; CHECK-NEXT:    cmovnel %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    decl %edi
+; CHECK-NEXT:    testl %edi, %ecx
+; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = xor i32 %a, -1
   %t1 = add i32 %a, -1
@@ -871,12 +871,12 @@
 define i64 @test_x86_tbm_tzmsk_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_tzmsk_u64_z2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    decq %rdi
-; CHECK-NEXT:    testq %rdi, %rax
-; CHECK-NEXT:    cmovneq %rdx, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    decq %rdi
+; CHECK-NEXT:    testq %rdi, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = xor i64 %a, -1
   %t1 = add i64 %a, -1
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index f420141..88830ee 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -41,9 +41,8 @@
 ; SSE2-LABEL: test3:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: test3:
@@ -165,9 +164,9 @@
 define <2 x i32> @test8(<8 x i32> %v) {
 ; SSE2-LABEL: test8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: test8:
diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll
index 2944b17..fdcd99a 100644
--- a/llvm/test/CodeGen/X86/twoaddr-lea.ll
+++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll
@@ -11,8 +11,8 @@
 
 define i32 @test1(i32 %X) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK-NOT: mov
-; CHECK: leal 1(%rdi)
+; CHECK: movl %edi, %eax
+; CHECK: leal 1(%rax)
         %Z = add i32 %X, 1
         store volatile i32 %Z, i32* @G
         ret i32 %X
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index 5d001a9..72b1fcc 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -15,8 +15,8 @@
 ;
 ; X64-LABEL: a:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $3, %ecx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl $3, %ecx
 ; X64-NEXT:    mull %ecx
 ; X64-NEXT:    seto %al
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index a1193af..7961fba 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -6,13 +6,13 @@
 ; X64-LABEL: muloti_test:
 ; X64:       # %bb.0: # %start
 ; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    testq %rcx, %rcx
-; X64-NEXT:    setne %al
+; X64-NEXT:    setne %dl
 ; X64-NEXT:    testq %rsi, %rsi
 ; X64-NEXT:    setne %r9b
-; X64-NEXT:    andb %al, %r9b
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rdx
+; X64-NEXT:    andb %dl, %r9b
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    seto %r10b
 ; X64-NEXT:    movq %rcx, %rax
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
index c1d7bb6..ac55478 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -10,18 +10,20 @@
 define i8 @out8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: out8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andb $60, %dil
-; CHECK-NOBMI-NEXT:    andb $-61, %sil
-; CHECK-NOBMI-NEXT:    orb %dil, %sil
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andb $60, %dil
+; CHECK-NOBMI-NEXT:    andb $-61, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andb $60, %dil
-; CHECK-BMI-NEXT:    andb $-61, %sil
-; CHECK-BMI-NEXT:    orb %dil, %sil
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andb $60, %dil
+; CHECK-BMI-NEXT:    andb $-61, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, 60
   %my = and i8 %y, -61
@@ -110,18 +112,20 @@
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
 ; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    andb $60, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %sil
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
+; CHECK-BMI-NEXT:    movl %esi, %eax
 ; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    andb $60, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %sil
-; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
   %n1 = and i8 %n0, 60
@@ -132,18 +136,20 @@
 define i16 @in16_constmask(i16 %x, i16 %y) {
 ; CHECK-NOBMI-LABEL: in16_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $4080, %edi # imm = 0xFF0
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $4080, %eax # imm = 0xFF0
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in16_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $4080, %edi # imm = 0xFF0
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $4080, %eax # imm = 0xFF0
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i16 %x, %y
   %n1 = and i16 %n0, 4080
@@ -154,18 +160,18 @@
 define i32 @in32_constmask(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in32_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in32_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 16776960
@@ -202,18 +208,18 @@
 define i32 @in_constmask_commutativity_0_1(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_0_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_0_1:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 16776960
@@ -224,18 +230,18 @@
 define i32 @in_constmask_commutativity_1_0(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_1_0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl $16776960, %esi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_1_0:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %edi, %esi
-; CHECK-BMI-NEXT:    andl $16776960, %esi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorl %edi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 16776960
@@ -246,18 +252,18 @@
 define i32 @in_constmask_commutativity_1_1(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_1_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl $16776960, %esi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_1_1:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %edi, %esi
-; CHECK-BMI-NEXT:    andl $16776960, %esi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorl %edi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 16776960
@@ -272,20 +278,20 @@
 define i32 @in_complex_y0_constmask(i32 %x, i32 %y_hi, i32 %y_low) {
 ; CHECK-NOBMI-LABEL: in_complex_y0_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y0_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %y = and i32 %y_hi, %y_low
   %n0 = xor i32 %x, %y
@@ -297,20 +303,20 @@
 define i32 @in_complex_y1_constmask(i32 %x, i32 %y_hi, i32 %y_low) {
 ; CHECK-NOBMI-LABEL: in_complex_y1_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y1_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %y = and i32 %y_hi, %y_low
   %n0 = xor i32 %x, %y
@@ -416,18 +422,18 @@
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: n0_badconstmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    andl $-16776960, %esi # imm = 0xFF000100
-; CHECK-NOBMI-NEXT:    orl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    andl $-16776960, %eax # imm = 0xFF000100
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n0_badconstmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    andl $-16776960, %esi # imm = 0xFF000100
-; CHECK-BMI-NEXT:    orl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    andl $-16776960, %eax # imm = 0xFF000100
+; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i32 %x, 16776960
   %my = and i32 %y, -16776960 ; instead of -16776961
@@ -438,18 +444,18 @@
 define i32 @n1_thirdvar_constmask(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NOBMI-LABEL: n1_thirdvar_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-NOBMI-NEXT:    xorl %edx, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-NOBMI-NEXT:    xorl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n1_thirdvar_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
-; CHECK-BMI-NEXT:    xorl %edx, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $16776960, %eax # imm = 0xFFFF00
+; CHECK-BMI-NEXT:    xorl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 16776960
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
index 90d1c56..95de3bb 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -10,18 +10,20 @@
 define i8 @out8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: out8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andb $85, %dil
-; CHECK-NOBMI-NEXT:    andb $-86, %sil
-; CHECK-NOBMI-NEXT:    orb %dil, %sil
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andb $85, %dil
+; CHECK-NOBMI-NEXT:    andb $-86, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andb $85, %dil
-; CHECK-BMI-NEXT:    andb $-86, %sil
-; CHECK-BMI-NEXT:    orb %dil, %sil
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andb $85, %dil
+; CHECK-BMI-NEXT:    andb $-86, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, 85
   %my = and i8 %y, -86
@@ -110,18 +112,20 @@
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
 ; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    andb $85, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %sil
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
+; CHECK-BMI-NEXT:    movl %esi, %eax
 ; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    andb $85, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %sil
-; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
   %n1 = and i8 %n0, 85
@@ -132,18 +136,20 @@
 define i16 @in16_constmask(i16 %x, i16 %y) {
 ; CHECK-NOBMI-LABEL: in16_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $21845, %edi # imm = 0x5555
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $21845, %eax # imm = 0x5555
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in16_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $21845, %edi # imm = 0x5555
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $21845, %eax # imm = 0x5555
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i16 %x, %y
   %n1 = and i16 %n0, 21845
@@ -154,18 +160,18 @@
 define i32 @in32_constmask(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in32_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in32_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 1431655765
@@ -202,18 +208,18 @@
 define i32 @in_constmask_commutativity_0_1(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_0_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_0_1:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 1431655765
@@ -224,18 +230,18 @@
 define i32 @in_constmask_commutativity_1_0(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_1_0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_1_0:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %edi, %esi
-; CHECK-BMI-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorl %edi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 1431655765
@@ -246,18 +252,18 @@
 define i32 @in_constmask_commutativity_1_1(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_1_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_1_1:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %edi, %esi
-; CHECK-BMI-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorl %edi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 1431655765
@@ -272,20 +278,20 @@
 define i32 @in_complex_y0_constmask(i32 %x, i32 %y_hi, i32 %y_low) {
 ; CHECK-NOBMI-LABEL: in_complex_y0_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y0_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %y = and i32 %y_hi, %y_low
   %n0 = xor i32 %x, %y
@@ -297,20 +303,20 @@
 define i32 @in_complex_y1_constmask(i32 %x, i32 %y_hi, i32 %y_low) {
 ; CHECK-NOBMI-LABEL: in_complex_y1_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y1_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %y = and i32 %y_hi, %y_low
   %n0 = xor i32 %x, %y
@@ -416,18 +422,18 @@
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: n0_badconstmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    andl $-1431655765, %esi # imm = 0xAAAAAAAB
-; CHECK-NOBMI-NEXT:    orl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    andl $-1431655765, %eax # imm = 0xAAAAAAAB
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n0_badconstmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-BMI-NEXT:    andl $-1431655765, %esi # imm = 0xAAAAAAAB
-; CHECK-BMI-NEXT:    orl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; CHECK-BMI-NEXT:    andl $-1431655765, %eax # imm = 0xAAAAAAAB
+; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i32 %x, 1431655765
   %my = and i32 %y, -1431655765 ; instead of -1431655766
@@ -438,18 +444,18 @@
 define i32 @n1_thirdvar_constmask(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NOBMI-LABEL: n1_thirdvar_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-NOBMI-NEXT:    xorl %edx, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-NOBMI-NEXT:    xorl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n1_thirdvar_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; CHECK-BMI-NEXT:    xorl %edx, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $1431655765, %eax # imm = 0x55555555
+; CHECK-BMI-NEXT:    xorl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 1431655765
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
index b8a5fb3..c7579e2 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
@@ -10,18 +10,20 @@
 define i8 @out8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: out8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    andb $-16, %sil
-; CHECK-NOBMI-NEXT:    orb %dil, %sil
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andb $15, %dil
+; CHECK-NOBMI-NEXT:    andb $-16, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    andb $-16, %sil
-; CHECK-BMI-NEXT:    orb %dil, %sil
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andb $15, %dil
+; CHECK-BMI-NEXT:    andb $-16, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, 15
   %my = and i8 %y, -16
@@ -110,18 +112,20 @@
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
 ; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %sil
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
+; CHECK-BMI-NEXT:    movl %esi, %eax
 ; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %sil
-; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
   %n1 = and i8 %n0, 15
@@ -132,18 +136,20 @@
 define i16 @in16_constmask(i16 %x, i16 %y) {
 ; CHECK-NOBMI-LABEL: in16_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $3855, %edi # imm = 0xF0F
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $3855, %eax # imm = 0xF0F
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in16_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $3855, %edi # imm = 0xF0F
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $3855, %eax # imm = 0xF0F
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i16 %x, %y
   %n1 = and i16 %n0, 3855
@@ -154,18 +160,18 @@
 define i32 @in32_constmask(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in32_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in32_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 252645135
@@ -202,18 +208,18 @@
 define i32 @in_constmask_commutativity_0_1(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_0_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_0_1:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 252645135
@@ -224,18 +230,18 @@
 define i32 @in_constmask_commutativity_1_0(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_1_0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_1_0:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %edi, %esi
-; CHECK-BMI-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorl %edi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 252645135
@@ -246,18 +252,18 @@
 define i32 @in_constmask_commutativity_1_1(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: in_constmask_commutativity_1_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constmask_commutativity_1_1:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %edi, %esi
-; CHECK-BMI-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorl %edi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 252645135
@@ -272,20 +278,20 @@
 define i32 @in_complex_y0_constmask(i32 %x, i32 %y_hi, i32 %y_low) {
 ; CHECK-NOBMI-LABEL: in_complex_y0_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y0_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %y = and i32 %y_hi, %y_low
   %n0 = xor i32 %x, %y
@@ -297,20 +303,20 @@
 define i32 @in_complex_y1_constmask(i32 %x, i32 %y_hi, i32 %y_low) {
 ; CHECK-NOBMI-LABEL: in_complex_y1_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y1_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %y = and i32 %y_hi, %y_low
   %n0 = xor i32 %x, %y
@@ -416,18 +422,18 @@
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-NOBMI-LABEL: n0_badconstmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    andl $-252645135, %esi # imm = 0xF0F0F0F1
-; CHECK-NOBMI-NEXT:    orl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    andl $-252645135, %eax # imm = 0xF0F0F0F1
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n0_badconstmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    andl $-252645135, %esi # imm = 0xF0F0F0F1
-; CHECK-BMI-NEXT:    orl %edi, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    andl $-252645135, %eax # imm = 0xF0F0F0F1
+; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i32 %x, 252645135
   %my = and i32 %y, -252645135 ; instead of -252645136
@@ -438,18 +444,18 @@
 define i32 @n1_thirdvar_constmask(i32 %x, i32 %y, i32 %z) {
 ; CHECK-NOBMI-LABEL: n1_thirdvar_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-NOBMI-NEXT:    xorl %edx, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-NOBMI-NEXT:    xorl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n1_thirdvar_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; CHECK-BMI-NEXT:    xorl %edx, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; CHECK-BMI-NEXT:    xorl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, 252645135
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
index cd0fa5f..4a63eba 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -10,18 +10,20 @@
 define i8 @out8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: out8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    andb $-16, %sil
-; CHECK-NOBMI-NEXT:    orb %dil, %sil
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andb $15, %dil
+; CHECK-NOBMI-NEXT:    andb $-16, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    andb $-16, %sil
-; CHECK-BMI-NEXT:    orb %dil, %sil
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andb $15, %dil
+; CHECK-BMI-NEXT:    andb $-16, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, 15
   %my = and i8 %y, -16
@@ -100,18 +102,20 @@
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
 ; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %sil
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
+; CHECK-BMI-NEXT:    movl %esi, %eax
 ; CHECK-BMI-NEXT:    xorl %esi, %edi
 ; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %sil
-; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
   %n1 = and i8 %n0, 15
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 00b8756..b4065c2 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,20 +6,22 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notb %dl
-; CHECK-NOBMI-NEXT:    andb %sil, %dl
-; CHECK-NOBMI-NEXT:    orb %dil, %dl
 ; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notb %al
+; CHECK-NOBMI-NEXT:    andb %sil, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    notb %dl
-; CHECK-BMI-NEXT:    andb %sil, %dl
-; CHECK-BMI-NEXT:    orb %dil, %dl
 ; CHECK-BMI-NEXT:    movl %edx, %eax
+; CHECK-BMI-NEXT:    andl %edx, %edi
+; CHECK-BMI-NEXT:    notb %al
+; CHECK-BMI-NEXT:    andb %sil, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, %mask
   %notmask = xor i8 %mask, -1
@@ -31,11 +33,12 @@
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: out16:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %edx
-; CHECK-NOBMI-NEXT:    andl %esi, %edx
-; CHECK-NOBMI-NEXT:    orl %edi, %edx
 ; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
+; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out16:
@@ -55,11 +58,11 @@
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out32:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %edx
-; CHECK-NOBMI-NEXT:    andl %esi, %edx
-; CHECK-NOBMI-NEXT:    orl %edi, %edx
 ; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out32:
@@ -78,11 +81,11 @@
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; CHECK-NOBMI-LABEL: out64:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andq %rdx, %rdi
-; CHECK-NOBMI-NEXT:    notq %rdx
-; CHECK-NOBMI-NEXT:    andq %rsi, %rdx
-; CHECK-NOBMI-NEXT:    orq %rdi, %rdx
 ; CHECK-NOBMI-NEXT:    movq %rdx, %rax
+; CHECK-NOBMI-NEXT:    andq %rdx, %rdi
+; CHECK-NOBMI-NEXT:    notq %rax
+; CHECK-NOBMI-NEXT:    andq %rsi, %rax
+; CHECK-NOBMI-NEXT:    orq %rdi, %rax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out64:
@@ -104,10 +107,11 @@
 define i8 @in8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: in8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8:
@@ -126,10 +130,11 @@
 define i16 @in16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: in16:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in16:
@@ -148,10 +153,10 @@
 define i32 @in32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in32:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in32:
@@ -169,10 +174,10 @@
 define i64 @in64(i64 %x, i64 %y, i64 %mask) {
 ; CHECK-NOBMI-LABEL: in64:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorq %rsi, %rdi
-; CHECK-NOBMI-NEXT:    andq %rdx, %rdi
-; CHECK-NOBMI-NEXT:    xorq %rsi, %rdi
 ; CHECK-NOBMI-NEXT:    movq %rdi, %rax
+; CHECK-NOBMI-NEXT:    xorq %rsi, %rax
+; CHECK-NOBMI-NEXT:    andq %rdx, %rax
+; CHECK-NOBMI-NEXT:    xorq %rsi, %rax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in64:
@@ -192,10 +197,10 @@
 define i32 @in_commutativity_0_0_1(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_0_0_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_0_0_1:
@@ -212,10 +217,10 @@
 define i32 @in_commutativity_0_1_0(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_0_1_0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_0_1_0:
@@ -232,10 +237,10 @@
 define i32 @in_commutativity_0_1_1(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_0_1_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_0_1_1:
@@ -252,10 +257,10 @@
 define i32 @in_commutativity_1_0_0(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_1_0_0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_1_0_0:
@@ -272,10 +277,10 @@
 define i32 @in_commutativity_1_0_1(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_1_0_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_1_0_1:
@@ -292,10 +297,10 @@
 define i32 @in_commutativity_1_1_0(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_1_1_0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_1_1_0:
@@ -312,10 +317,10 @@
 define i32 @in_commutativity_1_1_1(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_commutativity_1_1_1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %edi, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_commutativity_1_1_1:
@@ -335,11 +340,11 @@
 define i32 @in_complex_y0(i32 %x, i32 %y_hi, i32 %y_low, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_complex_y0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y0:
@@ -358,11 +363,11 @@
 define i32 @in_complex_y1(i32 %x, i32 %y_hi, i32 %y_low, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_complex_y1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y1:
@@ -384,11 +389,11 @@
 define i32 @in_complex_m0(i32 %x, i32 %y, i32 %m_a, i32 %m_b) {
 ; CHECK-NOBMI-LABEL: in_complex_m0:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %ecx, %edx
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %ecx, %edx
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_m0:
@@ -407,11 +412,11 @@
 define i32 @in_complex_m1(i32 %x, i32 %y, i32 %m_a, i32 %m_b) {
 ; CHECK-NOBMI-LABEL: in_complex_m1:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %ecx, %edx
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %ecx, %edx
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_m1:
@@ -433,12 +438,12 @@
 define i32 @in_complex_y0_m0(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-NOBMI-LABEL: in_complex_y0_m0:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
 ; CHECK-NOBMI-NEXT:    andl %edx, %esi
 ; CHECK-NOBMI-NEXT:    xorl %r8d, %ecx
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y0_m0:
@@ -459,12 +464,12 @@
 define i32 @in_complex_y1_m0(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-NOBMI-LABEL: in_complex_y1_m0:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
 ; CHECK-NOBMI-NEXT:    andl %edx, %esi
 ; CHECK-NOBMI-NEXT:    xorl %r8d, %ecx
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y1_m0:
@@ -485,12 +490,12 @@
 define i32 @in_complex_y0_m1(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-NOBMI-LABEL: in_complex_y0_m1:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
 ; CHECK-NOBMI-NEXT:    andl %edx, %esi
 ; CHECK-NOBMI-NEXT:    xorl %r8d, %ecx
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y0_m1:
@@ -511,12 +516,12 @@
 define i32 @in_complex_y1_m1(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-NOBMI-LABEL: in_complex_y1_m1:
 ; CHECK-NOBMI:       # %bb.0:
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
 ; CHECK-NOBMI-NEXT:    andl %edx, %esi
 ; CHECK-NOBMI-NEXT:    xorl %r8d, %ecx
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_complex_y1_m1:
@@ -540,18 +545,18 @@
 define i32 @out_constant_varx_mone(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_varx_mone:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %edx
-; CHECK-NOBMI-NEXT:    orl %edx, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    notl %edx
+; CHECK-NOBMI-NEXT:    orl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_varx_mone:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    notl %edx
-; CHECK-BMI-NEXT:    orl %edx, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %eax
+; CHECK-BMI-NEXT:    notl %edx
+; CHECK-BMI-NEXT:    orl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %notmask = xor i32 %mask, -1
   %mx = and i32 %mask, %x
@@ -562,10 +567,10 @@
 define i32 @in_constant_varx_mone(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_constant_varx_mone:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    notl %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    notl %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constant_varx_mone:
@@ -603,11 +608,11 @@
 define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    notl %edx
-; CHECK-NOBMI-NEXT:    notl %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    notl %edx
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    notl %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constant_varx_mone_invmask:
@@ -649,10 +654,10 @@
 define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_constant_varx_42:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl $42, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl $42, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constant_varx_42:
@@ -671,11 +676,11 @@
 ; CHECK-NOBMI-LABEL: out_constant_varx_42_invmask:
 ; CHECK-NOBMI:       # %bb.0:
 ; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl %edi, %eax
-; CHECK-NOBMI-NEXT:    andl $42, %edx
-; CHECK-NOBMI-NEXT:    orl %eax, %edx
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    movl %edx, %ecx
+; CHECK-NOBMI-NEXT:    notl %ecx
+; CHECK-NOBMI-NEXT:    andl %edi, %ecx
+; CHECK-NOBMI-NEXT:    andl $42, %eax
+; CHECK-NOBMI-NEXT:    orl %ecx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_varx_42_invmask:
@@ -694,11 +699,11 @@
 define i32 @in_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: in_constant_varx_42_invmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    notl %edx
-; CHECK-NOBMI-NEXT:    xorl $42, %edi
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl $42, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    notl %edx
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl $42, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in_constant_varx_42_invmask:
@@ -757,18 +762,18 @@
 define i32 @out_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_mone_vary_invmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
-; CHECK-NOBMI-NEXT:    notl %edx
-; CHECK-NOBMI-NEXT:    orl %edx, %esi
 ; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    notl %edx
+; CHECK-NOBMI-NEXT:    orl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_mone_vary_invmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    notl %edx
-; CHECK-BMI-NEXT:    orl %edx, %esi
 ; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %eax
+; CHECK-BMI-NEXT:    notl %edx
+; CHECK-BMI-NEXT:    orl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %notmask = xor i32 %mask, -1
   %mx = and i32 %notmask, -1
@@ -845,20 +850,20 @@
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: out_constant_42_vary_invmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %esi
+; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
 ; CHECK-NOBMI-NEXT:    notl %edx
 ; CHECK-NOBMI-NEXT:    andl $42, %edx
-; CHECK-NOBMI-NEXT:    orl %edx, %esi
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out_constant_42_vary_invmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    andl %edx, %eax
 ; CHECK-BMI-NEXT:    notl %edx
 ; CHECK-BMI-NEXT:    andl $42, %edx
-; CHECK-BMI-NEXT:    orl %edx, %esi
-; CHECK-BMI-NEXT:    movl %esi, %eax
+; CHECK-BMI-NEXT:    orl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %notmask = xor i32 %mask, -1
   %mx = and i32 %notmask, 42
@@ -879,11 +884,11 @@
 ;
 ; CHECK-BMI-LABEL: in_constant_42_vary_invmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %esi
-; CHECK-BMI-NEXT:    notl %edx
-; CHECK-BMI-NEXT:    andl $42, %edx
-; CHECK-BMI-NEXT:    orl %esi, %edx
 ; CHECK-BMI-NEXT:    movl %edx, %eax
+; CHECK-BMI-NEXT:    andl %edx, %esi
+; CHECK-BMI-NEXT:    notl %eax
+; CHECK-BMI-NEXT:    andl $42, %eax
+; CHECK-BMI-NEXT:    orl %esi, %eax
 ; CHECK-BMI-NEXT:    retq
   %notmask = xor i32 %mask, -1
   %n0 = xor i32 42, %y ; %x
@@ -982,11 +987,11 @@
 define i32 @n0_badmask(i32 %x, i32 %y, i32 %mask, i32 %mask2) {
 ; CHECK-NOBMI-LABEL: n0_badmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %ecx
-; CHECK-NOBMI-NEXT:    andl %esi, %ecx
-; CHECK-NOBMI-NEXT:    orl %edi, %ecx
 ; CHECK-NOBMI-NEXT:    movl %ecx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n0_badmask:
@@ -1004,20 +1009,20 @@
 define i32 @n0_badxor(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-NOBMI-LABEL: n0_badxor:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    xorl $1, %edx
-; CHECK-NOBMI-NEXT:    andl %esi, %edx
-; CHECK-NOBMI-NEXT:    orl %edi, %edx
 ; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    xorl $1, %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n0_badxor:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    xorl $1, %edx
-; CHECK-BMI-NEXT:    andl %esi, %edx
-; CHECK-BMI-NEXT:    orl %edi, %edx
 ; CHECK-BMI-NEXT:    movl %edx, %eax
+; CHECK-BMI-NEXT:    andl %edx, %edi
+; CHECK-BMI-NEXT:    xorl $1, %eax
+; CHECK-BMI-NEXT:    andl %esi, %eax
+; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i32 %x, %mask
   %notmask = xor i32 %mask, 1 ; instead of -1
@@ -1028,18 +1033,18 @@
 define i32 @n1_thirdvar(i32 %x, i32 %y, i32 %z, i32 %mask) {
 ; CHECK-NOBMI-LABEL: n1_thirdvar:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andl %ecx, %edi
-; CHECK-NOBMI-NEXT:    xorl %edx, %edi
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %ecx, %eax
+; CHECK-NOBMI-NEXT:    xorl %edx, %eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: n1_thirdvar:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andl %ecx, %edi
-; CHECK-BMI-NEXT:    xorl %edx, %edi
 ; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorl %esi, %eax
+; CHECK-BMI-NEXT:    andl %ecx, %eax
+; CHECK-BMI-NEXT:    xorl %edx, %eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, %mask
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
index 4e50bf9..d9cc204 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -10,13 +10,13 @@
 define <4 x i32> @out_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_varx_mone:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
 ; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_varx_mone:
@@ -49,11 +49,11 @@
 define <4 x i32> @in_constant_varx_mone(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_varx_mone:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
 ; CHECK-SSE1-NEXT:    andnps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    xorps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_mone:
@@ -84,12 +84,12 @@
 define <4 x i32> @out_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_varx_mone_invmask:
@@ -120,6 +120,7 @@
 define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_varx_mone_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm0
 ; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm2
@@ -127,7 +128,6 @@
 ; CHECK-SSE1-NEXT:    andnps %xmm2, %xmm0
 ; CHECK-SSE1-NEXT:    xorps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
@@ -161,13 +161,13 @@
 define <4 x i32> @out_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_varx_42:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_varx_42:
@@ -198,13 +198,13 @@
 define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_varx_42:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_42:
@@ -235,13 +235,13 @@
 define <4 x i32> @out_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_varx_42_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    andps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_varx_42_invmask:
@@ -273,13 +273,13 @@
 define <4 x i32> @in_constant_varx_42_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    andps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
@@ -310,12 +310,12 @@
 define <4 x i32> @out_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_mone_vary:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
 ; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_mone_vary:
@@ -345,12 +345,12 @@
 define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_mone_vary:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
 ; CHECK-SSE1-NEXT:    orps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    movaps %xmm1, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_mone_vary:
@@ -380,13 +380,13 @@
 define <4 x i32> @out_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
 ; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask:
@@ -420,13 +420,13 @@
 define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
 ; CHECK-SSE1-NEXT:    xorps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
@@ -459,13 +459,13 @@
 define <4 x i32> @out_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_42_vary:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
 ; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_42_vary:
@@ -496,13 +496,13 @@
 define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_42_vary:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
 ; CHECK-SSE1-NEXT:    andps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_42_vary:
@@ -533,13 +533,13 @@
 define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: out_constant_42_vary_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm1
 ; CHECK-SSE1-NEXT:    andps (%rdx), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_constant_42_vary_invmask:
@@ -571,13 +571,13 @@
 define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
 ; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps (%rdx), %xmm1
 ; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps {{.*}}(%rip), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index a7e516c..4061f47 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,11 +16,12 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notb %dl
-; CHECK-NEXT:    andb %sil, %dl
-; CHECK-NEXT:    orb %dil, %dl
 ; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notb %al
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    orb %dil, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
   %notmask = xor <1 x i8> %mask, <i8 -1>
@@ -36,29 +37,31 @@
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r8d, %edi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    notb %r8b
+; CHECK-BASELINE-NEXT:    notb %al
 ; CHECK-BASELINE-NEXT:    notb %r9b
 ; CHECK-BASELINE-NEXT:    andb %cl, %r9b
-; CHECK-BASELINE-NEXT:    andb %dl, %r8b
-; CHECK-BASELINE-NEXT:    orb %dil, %r8b
+; CHECK-BASELINE-NEXT:    andb %dl, %al
+; CHECK-BASELINE-NEXT:    orb %dil, %al
 ; CHECK-BASELINE-NEXT:    orb %sil, %r9b
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r8d, %edi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    notb %r8b
+; CHECK-SSE1-NEXT:    notb %al
 ; CHECK-SSE1-NEXT:    notb %r9b
 ; CHECK-SSE1-NEXT:    andb %cl, %r9b
-; CHECK-SSE1-NEXT:    andb %dl, %r8b
-; CHECK-SSE1-NEXT:    orb %dil, %r8b
+; CHECK-SSE1-NEXT:    andb %dl, %al
+; CHECK-SSE1-NEXT:    orb %dil, %al
 ; CHECK-SSE1-NEXT:    orb %sil, %r9b
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -87,11 +90,12 @@
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notl %edx
-; CHECK-NEXT:    andl %esi, %edx
-; CHECK-NEXT:    orl %edi, %edx
 ; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i16> %x, %mask
   %notmask = xor <1 x i16> %mask, <i16 -1>
@@ -108,62 +112,62 @@
 ; CHECK-BASELINE-LABEL: out_v4i8:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    andb %bl, %r8b
-; CHECK-BASELINE-NEXT:    andb %al, %cl
-; CHECK-BASELINE-NEXT:    andb %r11b, %dl
-; CHECK-BASELINE-NEXT:    andb %r10b, %sil
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    andb %r11b, %cl
+; CHECK-BASELINE-NEXT:    andb %r10b, %dl
+; CHECK-BASELINE-NEXT:    andb %dil, %sil
 ; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    andb %r9b, %r10b
-; CHECK-BASELINE-NEXT:    orb %sil, %r10b
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    notb %dil
+; CHECK-BASELINE-NEXT:    andb %r9b, %dil
+; CHECK-BASELINE-NEXT:    orb %sil, %dil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    orb %r8b, %bl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %cl, %al
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    orb %dl, %r11b
-; CHECK-BASELINE-NEXT:    movb %bl, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r10b, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    orb %cl, %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    orb %dl, %r10b
+; CHECK-BASELINE-NEXT:    movb %bl, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %r11b, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %r10b, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %dil, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i8:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    andb %bl, %r8b
-; CHECK-SSE1-NEXT:    andb %al, %cl
-; CHECK-SSE1-NEXT:    andb %r11b, %dl
-; CHECK-SSE1-NEXT:    andb %r10b, %sil
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    andb %r11b, %cl
+; CHECK-SSE1-NEXT:    andb %r10b, %dl
+; CHECK-SSE1-NEXT:    andb %dil, %sil
 ; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    andb %r9b, %r10b
-; CHECK-SSE1-NEXT:    orb %sil, %r10b
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    notb %dil
+; CHECK-SSE1-NEXT:    andb %r9b, %dil
+; CHECK-SSE1-NEXT:    orb %sil, %dil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    orb %r8b, %bl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %cl, %al
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    orb %dl, %r11b
-; CHECK-SSE1-NEXT:    movb %bl, 3(%rdi)
-; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-SSE1-NEXT:    movb %r10b, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    orb %cl, %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    orb %dl, %r10b
+; CHECK-SSE1-NEXT:    movb %bl, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %r11b, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %r10b, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %dil, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -192,52 +196,52 @@
 define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i8_undef:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    andb %al, %r8b
-; CHECK-BASELINE-NEXT:    andb %r11b, %dl
-; CHECK-BASELINE-NEXT:    andb %r10b, %sil
+; CHECK-BASELINE-NEXT:    andb %r11b, %r8b
+; CHECK-BASELINE-NEXT:    andb %r10b, %dl
+; CHECK-BASELINE-NEXT:    andb %dil, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    notb %al
 ; CHECK-BASELINE-NEXT:    notb %r10b
-; CHECK-BASELINE-NEXT:    andb %r9b, %r10b
-; CHECK-BASELINE-NEXT:    orb %sil, %r10b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %r8b, %al
+; CHECK-BASELINE-NEXT:    notb %r11b
+; CHECK-BASELINE-NEXT:    notb %dil
+; CHECK-BASELINE-NEXT:    andb %r9b, %dil
+; CHECK-BASELINE-NEXT:    orb %sil, %dil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    orb %dl, %r11b
-; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %al, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r10b, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    orb %r8b, %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    orb %dl, %r10b
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %r11b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %r10b, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %dil, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i8_undef:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    andb %al, %r8b
-; CHECK-SSE1-NEXT:    andb %r11b, %dl
-; CHECK-SSE1-NEXT:    andb %r10b, %sil
+; CHECK-SSE1-NEXT:    andb %r11b, %r8b
+; CHECK-SSE1-NEXT:    andb %r10b, %dl
+; CHECK-SSE1-NEXT:    andb %dil, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    notb %al
 ; CHECK-SSE1-NEXT:    notb %r10b
-; CHECK-SSE1-NEXT:    andb %r9b, %r10b
-; CHECK-SSE1-NEXT:    orb %sil, %r10b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %r8b, %al
+; CHECK-SSE1-NEXT:    notb %r11b
+; CHECK-SSE1-NEXT:    notb %dil
+; CHECK-SSE1-NEXT:    andb %r9b, %dil
+; CHECK-SSE1-NEXT:    orb %sil, %dil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    orb %dl, %r11b
-; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %al, 3(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-SSE1-NEXT:    movb %r10b, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    orb %r8b, %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    orb %dl, %r10b
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %r11b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %r10b, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %dil, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i8_undef:
@@ -265,29 +269,31 @@
 define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notl %r8d
+; CHECK-BASELINE-NEXT:    notl %eax
 ; CHECK-BASELINE-NEXT:    notl %r9d
 ; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
 ; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    andl %edx, %r8d
-; CHECK-BASELINE-NEXT:    orl %edi, %r8d
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    andl %edx, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %eax
+; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i16:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
 ; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notl %r8d
+; CHECK-SSE1-NEXT:    notl %eax
 ; CHECK-SSE1-NEXT:    notl %r9d
 ; CHECK-SSE1-NEXT:    andl %ecx, %r9d
 ; CHECK-SSE1-NEXT:    orl %esi, %r9d
-; CHECK-SSE1-NEXT:    andl %edx, %r8d
-; CHECK-SSE1-NEXT:    orl %edi, %r8d
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    andl %edx, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %eax
+; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -316,11 +322,11 @@
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notl %edx
-; CHECK-NEXT:    andl %esi, %edx
-; CHECK-NEXT:    orl %edi, %edx
 ; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -342,60 +348,60 @@
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    andb %al, %r9b
-; CHECK-BASELINE-NEXT:    andb %bl, %r8b
-; CHECK-BASELINE-NEXT:    andb %r14b, %cl
+; CHECK-BASELINE-NEXT:    andb %bl, %r9b
+; CHECK-BASELINE-NEXT:    andb %r15b, %r8b
+; CHECK-BASELINE-NEXT:    andb %bpl, %cl
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    andb %r11b, %dl
 ; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    andb %r10b, %sil
+; CHECK-BASELINE-NEXT:    andb %dil, %sil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-BASELINE-NEXT:    andb %r12b, %r13b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    andb %r15b, %cl
+; CHECK-BASELINE-NEXT:    andb %r14b, %cl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    andb %bpl, %dl
-; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    andb %r10b, %dl
+; CHECK-BASELINE-NEXT:    notb %dil
 ; CHECK-BASELINE-NEXT:    notb %r11b
-; CHECK-BASELINE-NEXT:    notb %r14b
-; CHECK-BASELINE-NEXT:    notb %bl
-; CHECK-BASELINE-NEXT:    notb %al
 ; CHECK-BASELINE-NEXT:    notb %bpl
 ; CHECK-BASELINE-NEXT:    notb %r15b
+; CHECK-BASELINE-NEXT:    notb %bl
+; CHECK-BASELINE-NEXT:    notb %r10b
+; CHECK-BASELINE-NEXT:    notb %r14b
 ; CHECK-BASELINE-NEXT:    notb %r12b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-BASELINE-NEXT:    orb %r13b, %r12b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-BASELINE-NEXT:    orb %cl, %r15b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    orb %dl, %bpl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %r9b, %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    orb %r8b, %bl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    orb %cl, %r14b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    orb %dl, %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    orb %r9b, %bl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-BASELINE-NEXT:    orb %r8b, %r15b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-BASELINE-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    orb %sil, %r10b
-; CHECK-BASELINE-NEXT:    movb %r12b, 7(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r15b, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bpl, 5(%rdi)
-; CHECK-BASELINE-NEXT:    movb %al, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bl, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r14b, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r10b, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    orb %sil, %dil
+; CHECK-BASELINE-NEXT:    movb %r12b, 7(%rax)
+; CHECK-BASELINE-NEXT:    movb %r14b, 6(%rax)
+; CHECK-BASELINE-NEXT:    movb %r10b, 5(%rax)
+; CHECK-BASELINE-NEXT:    movb %bl, 4(%rax)
+; CHECK-BASELINE-NEXT:    movb %r15b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %bpl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %dil, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -412,60 +418,60 @@
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    andb %al, %r9b
-; CHECK-SSE1-NEXT:    andb %bl, %r8b
-; CHECK-SSE1-NEXT:    andb %r14b, %cl
+; CHECK-SSE1-NEXT:    andb %bl, %r9b
+; CHECK-SSE1-NEXT:    andb %r15b, %r8b
+; CHECK-SSE1-NEXT:    andb %bpl, %cl
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    andb %r11b, %dl
 ; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    andb %r10b, %sil
+; CHECK-SSE1-NEXT:    andb %dil, %sil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-SSE1-NEXT:    andb %r12b, %r13b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    andb %r15b, %cl
+; CHECK-SSE1-NEXT:    andb %r14b, %cl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    andb %bpl, %dl
-; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    andb %r10b, %dl
+; CHECK-SSE1-NEXT:    notb %dil
 ; CHECK-SSE1-NEXT:    notb %r11b
-; CHECK-SSE1-NEXT:    notb %r14b
-; CHECK-SSE1-NEXT:    notb %bl
-; CHECK-SSE1-NEXT:    notb %al
 ; CHECK-SSE1-NEXT:    notb %bpl
 ; CHECK-SSE1-NEXT:    notb %r15b
+; CHECK-SSE1-NEXT:    notb %bl
+; CHECK-SSE1-NEXT:    notb %r10b
+; CHECK-SSE1-NEXT:    notb %r14b
 ; CHECK-SSE1-NEXT:    notb %r12b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-SSE1-NEXT:    orb %r13b, %r12b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-SSE1-NEXT:    orb %cl, %r15b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    orb %dl, %bpl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %r9b, %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    orb %r8b, %bl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r14b # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    orb %cl, %r14b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    orb %dl, %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    orb %r9b, %bl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
+; CHECK-SSE1-NEXT:    orb %r8b, %r15b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %bpl # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-SSE1-NEXT:    orb {{[-0-9]+}}(%r{{[sb]}}p), %r11b # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    orb %sil, %r10b
-; CHECK-SSE1-NEXT:    movb %r12b, 7(%rdi)
-; CHECK-SSE1-NEXT:    movb %r15b, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb %bpl, 5(%rdi)
-; CHECK-SSE1-NEXT:    movb %al, 4(%rdi)
-; CHECK-SSE1-NEXT:    movb %bl, 3(%rdi)
-; CHECK-SSE1-NEXT:    movb %r14b, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-SSE1-NEXT:    movb %r10b, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    orb %sil, %dil
+; CHECK-SSE1-NEXT:    movb %r12b, 7(%rax)
+; CHECK-SSE1-NEXT:    movb %r14b, 6(%rax)
+; CHECK-SSE1-NEXT:    movb %r10b, 5(%rax)
+; CHECK-SSE1-NEXT:    movb %bl, 4(%rax)
+; CHECK-SSE1-NEXT:    movb %r15b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %bpl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %r11b, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %dil, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -500,62 +506,62 @@
 ; CHECK-BASELINE-LABEL: out_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-BASELINE-NEXT:    andl %ebx, %esi
-; CHECK-BASELINE-NEXT:    andl %eax, %r8d
+; CHECK-BASELINE-NEXT:    andl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andl %r11d, %ecx
 ; CHECK-BASELINE-NEXT:    andl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    notl %r10d
 ; CHECK-BASELINE-NEXT:    notl %r11d
-; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    notl %edi
 ; CHECK-BASELINE-NEXT:    notl %ebx
 ; CHECK-BASELINE-NEXT:    andl %r9d, %ebx
 ; CHECK-BASELINE-NEXT:    orl %esi, %ebx
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %ax
-; CHECK-BASELINE-NEXT:    orl %r8d, %eax
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %di
+; CHECK-BASELINE-NEXT:    orl %r8d, %edi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    orl %ecx, %r11d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    orl %edx, %r10d
-; CHECK-BASELINE-NEXT:    movw %bx, (%rdi)
-; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r11w, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %bx, (%rax)
+; CHECK-BASELINE-NEXT:    movw %di, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %r11w, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-SSE1-NEXT:    andl %ebx, %esi
-; CHECK-SSE1-NEXT:    andl %eax, %r8d
+; CHECK-SSE1-NEXT:    andl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andl %r11d, %ecx
 ; CHECK-SSE1-NEXT:    andl %r10d, %edx
 ; CHECK-SSE1-NEXT:    notl %r10d
 ; CHECK-SSE1-NEXT:    notl %r11d
-; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    notl %edi
 ; CHECK-SSE1-NEXT:    notl %ebx
 ; CHECK-SSE1-NEXT:    andl %r9d, %ebx
 ; CHECK-SSE1-NEXT:    orl %esi, %ebx
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %ax
-; CHECK-SSE1-NEXT:    orl %r8d, %eax
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %di
+; CHECK-SSE1-NEXT:    orl %r8d, %edi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    orl %ecx, %r11d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    orl %edx, %r10d
-; CHECK-SSE1-NEXT:    movw %bx, (%rdi)
-; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
-; CHECK-SSE1-NEXT:    movw %r11w, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %r10w, 2(%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %bx, (%rax)
+; CHECK-SSE1-NEXT:    movw %di, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %r11w, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %r10w, 2(%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -584,52 +590,52 @@
 define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i16_undef:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    andl %eax, %esi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    andl %edi, %esi
 ; CHECK-BASELINE-NEXT:    andl %r11d, %r8d
 ; CHECK-BASELINE-NEXT:    andl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
 ; CHECK-BASELINE-NEXT:    notl %r10d
 ; CHECK-BASELINE-NEXT:    notl %r11d
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    andl %r9d, %eax
-; CHECK-BASELINE-NEXT:    orl %esi, %eax
+; CHECK-BASELINE-NEXT:    notl %edi
+; CHECK-BASELINE-NEXT:    andl %r9d, %edi
+; CHECK-BASELINE-NEXT:    orl %esi, %edi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    orl %r8d, %r11d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    orl %edx, %r10d
-; CHECK-BASELINE-NEXT:    movw %cx, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
-; CHECK-BASELINE-NEXT:    movw %r11w, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %di, (%rax)
+; CHECK-BASELINE-NEXT:    movw %r11w, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %r10w, 2(%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16_undef:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    andl %eax, %esi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    andl %edi, %esi
 ; CHECK-SSE1-NEXT:    andl %r11d, %r8d
 ; CHECK-SSE1-NEXT:    andl %r10d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
 ; CHECK-SSE1-NEXT:    notl %r10d
 ; CHECK-SSE1-NEXT:    notl %r11d
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    andl %r9d, %eax
-; CHECK-SSE1-NEXT:    orl %esi, %eax
+; CHECK-SSE1-NEXT:    notl %edi
+; CHECK-SSE1-NEXT:    andl %r9d, %edi
+; CHECK-SSE1-NEXT:    orl %esi, %edi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    orl %r8d, %r11d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    orl %edx, %r10d
-; CHECK-SSE1-NEXT:    movw %cx, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
-; CHECK-SSE1-NEXT:    movw %r11w, 6(%rdi)
-; CHECK-SSE1-NEXT:    movw %r10w, 2(%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %di, (%rax)
+; CHECK-SSE1-NEXT:    movw %r11w, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %r10w, 2(%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -657,29 +663,29 @@
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i32:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notl %r8d
+; CHECK-BASELINE-NEXT:    notl %eax
 ; CHECK-BASELINE-NEXT:    notl %r9d
 ; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
 ; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    andl %edx, %r8d
-; CHECK-BASELINE-NEXT:    orl %edi, %r8d
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    andl %edx, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %eax
 ; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i32:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
 ; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notl %r8d
+; CHECK-SSE1-NEXT:    notl %eax
 ; CHECK-SSE1-NEXT:    notl %r9d
 ; CHECK-SSE1-NEXT:    andl %ecx, %r9d
 ; CHECK-SSE1-NEXT:    orl %esi, %r9d
-; CHECK-SSE1-NEXT:    andl %edx, %r8d
-; CHECK-SSE1-NEXT:    orl %edi, %r8d
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    andl %edx, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %eax
 ; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -708,11 +714,11 @@
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andq %rdx, %rdi
-; CHECK-NEXT:    notq %rdx
-; CHECK-NEXT:    andq %rsi, %rdx
-; CHECK-NEXT:    orq %rdi, %rdx
 ; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    andq %rdx, %rdi
+; CHECK-NEXT:    notq %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    orq %rdi, %rax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -737,6 +743,8 @@
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
@@ -747,12 +755,6 @@
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb %al, %sil
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %sil, %al
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    andb %cl, %sil
 ; CHECK-BASELINE-NEXT:    notb %cl
@@ -803,51 +805,55 @@
 ; CHECK-BASELINE-NEXT:    notb %r10b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    orb %sil, %r10b
-; CHECK-BASELINE-NEXT:    movb %al, 15(%rdi)
-; CHECK-BASELINE-NEXT:    movb %cl, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movb %dl, 13(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bl, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r13b, 11(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r12b, 10(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r15b, 9(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r14b, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bpl, 7(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    andb %al, %r9b
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %r9b, %al
-; CHECK-BASELINE-NEXT:    movb %r10b, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    andb %dil, %sil
+; CHECK-BASELINE-NEXT:    notb %dil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    orb %sil, %dil
+; CHECK-BASELINE-NEXT:    movb %cl, 15(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 14(%rax)
+; CHECK-BASELINE-NEXT:    movb %bl, 13(%rax)
+; CHECK-BASELINE-NEXT:    movb %r13b, 12(%rax)
+; CHECK-BASELINE-NEXT:    movb %r12b, 11(%rax)
+; CHECK-BASELINE-NEXT:    movb %r15b, 10(%rax)
+; CHECK-BASELINE-NEXT:    movb %r14b, 9(%rax)
+; CHECK-BASELINE-NEXT:    movb %bpl, 8(%rax)
+; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rax)
+; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    andb %cl, %r8b
+; CHECK-BASELINE-NEXT:    andb %cl, %r9b
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    orb %r8b, %cl
-; CHECK-BASELINE-NEXT:    movb %al, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andb %al, %dl
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %dl, %al
-; CHECK-BASELINE-NEXT:    movb %cl, 3(%rdi)
+; CHECK-BASELINE-NEXT:    orb %r9b, %cl
+; CHECK-BASELINE-NEXT:    movb %dil, 5(%rax)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    andb %dl, %r8b
+; CHECK-BASELINE-NEXT:    notb %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    orb %r8b, %dl
+; CHECK-BASELINE-NEXT:    movb %cl, 4(%rax)
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andb %cl, %dl
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %sil
 ; CHECK-BASELINE-NEXT:    notb %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    orb %dl, %cl
-; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andb %al, %dl
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    orb %dl, %al
-; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movb %al, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    orb %sil, %cl
+; CHECK-BASELINE-NEXT:    movb %dl, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andb %dl, %sil
+; CHECK-BASELINE-NEXT:    notb %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    orb %sil, %dl
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andb %cl, %sil
+; CHECK-BASELINE-NEXT:    notb %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    orb %sil, %cl
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %cl, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -867,6 +873,8 @@
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
@@ -877,12 +885,6 @@
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb %al, %sil
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %sil, %al
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    andb %cl, %sil
 ; CHECK-SSE1-NEXT:    notb %cl
@@ -933,51 +935,55 @@
 ; CHECK-SSE1-NEXT:    notb %r10b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    orb %sil, %r10b
-; CHECK-SSE1-NEXT:    movb %al, 15(%rdi)
-; CHECK-SSE1-NEXT:    movb %cl, 14(%rdi)
-; CHECK-SSE1-NEXT:    movb %dl, 13(%rdi)
-; CHECK-SSE1-NEXT:    movb %bl, 12(%rdi)
-; CHECK-SSE1-NEXT:    movb %r13b, 11(%rdi)
-; CHECK-SSE1-NEXT:    movb %r12b, 10(%rdi)
-; CHECK-SSE1-NEXT:    movb %r15b, 9(%rdi)
-; CHECK-SSE1-NEXT:    movb %r14b, 8(%rdi)
-; CHECK-SSE1-NEXT:    movb %bpl, 7(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    andb %al, %r9b
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %r9b, %al
-; CHECK-SSE1-NEXT:    movb %r10b, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    andb %dil, %sil
+; CHECK-SSE1-NEXT:    notb %dil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    orb %sil, %dil
+; CHECK-SSE1-NEXT:    movb %cl, 15(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 14(%rax)
+; CHECK-SSE1-NEXT:    movb %bl, 13(%rax)
+; CHECK-SSE1-NEXT:    movb %r13b, 12(%rax)
+; CHECK-SSE1-NEXT:    movb %r12b, 11(%rax)
+; CHECK-SSE1-NEXT:    movb %r15b, 10(%rax)
+; CHECK-SSE1-NEXT:    movb %r14b, 9(%rax)
+; CHECK-SSE1-NEXT:    movb %bpl, 8(%rax)
+; CHECK-SSE1-NEXT:    movb %r11b, 7(%rax)
+; CHECK-SSE1-NEXT:    movb %r10b, 6(%rax)
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    andb %cl, %r8b
+; CHECK-SSE1-NEXT:    andb %cl, %r9b
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    orb %r8b, %cl
-; CHECK-SSE1-NEXT:    movb %al, 4(%rdi)
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andb %al, %dl
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %dl, %al
-; CHECK-SSE1-NEXT:    movb %cl, 3(%rdi)
+; CHECK-SSE1-NEXT:    orb %r9b, %cl
+; CHECK-SSE1-NEXT:    movb %dil, 5(%rax)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    andb %dl, %r8b
+; CHECK-SSE1-NEXT:    notb %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    orb %r8b, %dl
+; CHECK-SSE1-NEXT:    movb %cl, 4(%rax)
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andb %cl, %dl
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %sil
 ; CHECK-SSE1-NEXT:    notb %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    orb %dl, %cl
-; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andb %al, %dl
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    orb %dl, %al
-; CHECK-SSE1-NEXT:    movb %cl, 1(%rdi)
-; CHECK-SSE1-NEXT:    movb %al, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    orb %sil, %cl
+; CHECK-SSE1-NEXT:    movb %dl, 3(%rax)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-SSE1-NEXT:    andb %dl, %sil
+; CHECK-SSE1-NEXT:    notb %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    orb %sil, %dl
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-SSE1-NEXT:    andb %cl, %sil
+; CHECK-SSE1-NEXT:    notb %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    orb %sil, %cl
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %cl, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -1010,6 +1016,7 @@
 ; CHECK-BASELINE-NEXT:    pushq %rbp
 ; CHECK-BASELINE-NEXT:    pushq %r14
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
@@ -1033,11 +1040,11 @@
 ; CHECK-BASELINE-NEXT:    notl %ebx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    orl %r9d, %ebx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    andl %eax, %r8d
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %ax
-; CHECK-BASELINE-NEXT:    orl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    andl %edi, %r8d
+; CHECK-BASELINE-NEXT:    notl %edi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %di
+; CHECK-BASELINE-NEXT:    orl %r8d, %edi
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-BASELINE-NEXT:    andl %ebp, %ecx
 ; CHECK-BASELINE-NEXT:    notl %ebp
@@ -1053,15 +1060,14 @@
 ; CHECK-BASELINE-NEXT:    notl %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    orl %esi, %edx
-; CHECK-BASELINE-NEXT:    movw %r14w, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r10w, 10(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bx, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bp, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %cx, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movw %dx, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %r14w, 14(%rax)
+; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rax)
+; CHECK-BASELINE-NEXT:    movw %r10w, 10(%rax)
+; CHECK-BASELINE-NEXT:    movw %bx, 8(%rax)
+; CHECK-BASELINE-NEXT:    movw %di, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %bp, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %cx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %dx, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r14
 ; CHECK-BASELINE-NEXT:    popq %rbp
@@ -1072,6 +1078,7 @@
 ; CHECK-SSE1-NEXT:    pushq %rbp
 ; CHECK-SSE1-NEXT:    pushq %r14
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
@@ -1095,11 +1102,11 @@
 ; CHECK-SSE1-NEXT:    notl %ebx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    orl %r9d, %ebx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    andl %eax, %r8d
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %ax
-; CHECK-SSE1-NEXT:    orl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    andl %edi, %r8d
+; CHECK-SSE1-NEXT:    notl %edi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %di
+; CHECK-SSE1-NEXT:    orl %r8d, %edi
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-SSE1-NEXT:    andl %ebp, %ecx
 ; CHECK-SSE1-NEXT:    notl %ebp
@@ -1115,15 +1122,14 @@
 ; CHECK-SSE1-NEXT:    notl %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    orl %esi, %edx
-; CHECK-SSE1-NEXT:    movw %r14w, 14(%rdi)
-; CHECK-SSE1-NEXT:    movw %r11w, 12(%rdi)
-; CHECK-SSE1-NEXT:    movw %r10w, 10(%rdi)
-; CHECK-SSE1-NEXT:    movw %bx, 8(%rdi)
-; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
-; CHECK-SSE1-NEXT:    movw %bp, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %cx, 2(%rdi)
-; CHECK-SSE1-NEXT:    movw %dx, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %r14w, 14(%rax)
+; CHECK-SSE1-NEXT:    movw %r11w, 12(%rax)
+; CHECK-SSE1-NEXT:    movw %r10w, 10(%rax)
+; CHECK-SSE1-NEXT:    movw %bx, 8(%rax)
+; CHECK-SSE1-NEXT:    movw %di, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %bp, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %cx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %dx, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r14
 ; CHECK-SSE1-NEXT:    popq %rbp
@@ -1151,47 +1157,47 @@
 ; CHECK-BASELINE-LABEL: out_v4i32:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl (%rcx), %r8d
 ; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r9d
-; CHECK-BASELINE-NEXT:    movl 8(%rcx), %eax
+; CHECK-BASELINE-NEXT:    movl 8(%rcx), %edi
 ; CHECK-BASELINE-NEXT:    movl 12(%rcx), %ecx
 ; CHECK-BASELINE-NEXT:    movl 12(%rsi), %r10d
 ; CHECK-BASELINE-NEXT:    andl %ecx, %r10d
 ; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    andl %eax, %r11d
+; CHECK-BASELINE-NEXT:    andl %edi, %r11d
 ; CHECK-BASELINE-NEXT:    movl 4(%rsi), %ebx
 ; CHECK-BASELINE-NEXT:    andl %r9d, %ebx
 ; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
 ; CHECK-BASELINE-NEXT:    andl %r8d, %esi
 ; CHECK-BASELINE-NEXT:    notl %r8d
 ; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    notl %edi
 ; CHECK-BASELINE-NEXT:    notl %ecx
 ; CHECK-BASELINE-NEXT:    andl 12(%rdx), %ecx
 ; CHECK-BASELINE-NEXT:    orl %r10d, %ecx
-; CHECK-BASELINE-NEXT:    andl 8(%rdx), %eax
-; CHECK-BASELINE-NEXT:    orl %r11d, %eax
+; CHECK-BASELINE-NEXT:    andl 8(%rdx), %edi
+; CHECK-BASELINE-NEXT:    orl %r11d, %edi
 ; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r9d
 ; CHECK-BASELINE-NEXT:    orl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    andl (%rdx), %r8d
 ; CHECK-BASELINE-NEXT:    orl %esi, %r8d
-; CHECK-BASELINE-NEXT:    movl %ecx, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movl %eax, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r8d, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movl %ecx, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %edi, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %r8d, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i32:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i32:
@@ -1222,42 +1228,42 @@
 define <4 x i32> @out_v4i32_undef(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v4i32_undef:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r8d
 ; CHECK-BASELINE-NEXT:    movl (%rcx), %r9d
 ; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r10d
-; CHECK-BASELINE-NEXT:    movl 12(%rcx), %eax
+; CHECK-BASELINE-NEXT:    movl 12(%rcx), %edi
 ; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r8d
 ; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ecx
-; CHECK-BASELINE-NEXT:    andl %eax, %ecx
+; CHECK-BASELINE-NEXT:    andl %edi, %ecx
 ; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r11d
 ; CHECK-BASELINE-NEXT:    andl %r10d, %r11d
 ; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    notl %r9d
 ; CHECK-BASELINE-NEXT:    notl %r10d
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    andl 12(%rdx), %eax
-; CHECK-BASELINE-NEXT:    orl %ecx, %eax
+; CHECK-BASELINE-NEXT:    notl %edi
+; CHECK-BASELINE-NEXT:    andl 12(%rdx), %edi
+; CHECK-BASELINE-NEXT:    orl %ecx, %edi
 ; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r10d
 ; CHECK-BASELINE-NEXT:    orl %r11d, %r10d
 ; CHECK-BASELINE-NEXT:    andl (%rdx), %r9d
 ; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    movl %r8d, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movl %eax, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r10d, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r9d, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movl %r8d, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %edi, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %r10d, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %r9d, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i32_undef:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
 ; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i32_undef:
@@ -1288,29 +1294,29 @@
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i64:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movq %r8, %rax
 ; CHECK-BASELINE-NEXT:    andq %r9, %rsi
 ; CHECK-BASELINE-NEXT:    andq %r8, %rdi
-; CHECK-BASELINE-NEXT:    notq %r8
+; CHECK-BASELINE-NEXT:    notq %rax
 ; CHECK-BASELINE-NEXT:    notq %r9
 ; CHECK-BASELINE-NEXT:    andq %rcx, %r9
 ; CHECK-BASELINE-NEXT:    orq %rsi, %r9
-; CHECK-BASELINE-NEXT:    andq %rdx, %r8
-; CHECK-BASELINE-NEXT:    orq %rdi, %r8
-; CHECK-BASELINE-NEXT:    movq %r8, %rax
+; CHECK-BASELINE-NEXT:    andq %rdx, %rax
+; CHECK-BASELINE-NEXT:    orq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movq %r9, %rdx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i64:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %r8, %rax
 ; CHECK-SSE1-NEXT:    andq %r9, %rsi
 ; CHECK-SSE1-NEXT:    andq %r8, %rdi
-; CHECK-SSE1-NEXT:    notq %r8
+; CHECK-SSE1-NEXT:    notq %rax
 ; CHECK-SSE1-NEXT:    notq %r9
 ; CHECK-SSE1-NEXT:    andq %rcx, %r9
 ; CHECK-SSE1-NEXT:    orq %rsi, %r9
-; CHECK-SSE1-NEXT:    andq %rdx, %r8
-; CHECK-SSE1-NEXT:    orq %rdi, %r8
-; CHECK-SSE1-NEXT:    movq %r8, %rax
+; CHECK-SSE1-NEXT:    andq %rdx, %rax
+; CHECK-SSE1-NEXT:    orq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movq %r9, %rdx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -2287,6 +2293,7 @@
 ; CHECK-BASELINE-NEXT:    pushq %r15
 ; CHECK-BASELINE-NEXT:    pushq %r14
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl 4(%rcx), %r8d
 ; CHECK-BASELINE-NEXT:    movl 8(%rcx), %r9d
 ; CHECK-BASELINE-NEXT:    movl 12(%rcx), %r10d
@@ -2299,51 +2306,50 @@
 ; CHECK-BASELINE-NEXT:    notl %ebp
 ; CHECK-BASELINE-NEXT:    andl 28(%rdx), %ebp
 ; CHECK-BASELINE-NEXT:    orl %r14d, %ebp
-; CHECK-BASELINE-NEXT:    movl 24(%rsi), %eax
-; CHECK-BASELINE-NEXT:    andl %ebx, %eax
+; CHECK-BASELINE-NEXT:    movl 24(%rsi), %edi
+; CHECK-BASELINE-NEXT:    andl %ebx, %edi
 ; CHECK-BASELINE-NEXT:    notl %ebx
 ; CHECK-BASELINE-NEXT:    andl 24(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    orl %eax, %ebx
-; CHECK-BASELINE-NEXT:    movl 20(%rsi), %eax
-; CHECK-BASELINE-NEXT:    andl %r15d, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %ebx
+; CHECK-BASELINE-NEXT:    movl 20(%rsi), %edi
+; CHECK-BASELINE-NEXT:    andl %r15d, %edi
 ; CHECK-BASELINE-NEXT:    notl %r15d
 ; CHECK-BASELINE-NEXT:    andl 20(%rdx), %r15d
-; CHECK-BASELINE-NEXT:    orl %eax, %r15d
-; CHECK-BASELINE-NEXT:    movl 16(%rsi), %eax
-; CHECK-BASELINE-NEXT:    andl %r11d, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %r15d
+; CHECK-BASELINE-NEXT:    movl 16(%rsi), %edi
+; CHECK-BASELINE-NEXT:    andl %r11d, %edi
 ; CHECK-BASELINE-NEXT:    notl %r11d
 ; CHECK-BASELINE-NEXT:    andl 16(%rdx), %r11d
-; CHECK-BASELINE-NEXT:    orl %eax, %r11d
-; CHECK-BASELINE-NEXT:    movl 12(%rsi), %eax
-; CHECK-BASELINE-NEXT:    andl %r10d, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %r11d
+; CHECK-BASELINE-NEXT:    movl 12(%rsi), %edi
+; CHECK-BASELINE-NEXT:    andl %r10d, %edi
 ; CHECK-BASELINE-NEXT:    notl %r10d
 ; CHECK-BASELINE-NEXT:    andl 12(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    orl %eax, %r10d
-; CHECK-BASELINE-NEXT:    movl 8(%rsi), %eax
-; CHECK-BASELINE-NEXT:    andl %r9d, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %r10d
+; CHECK-BASELINE-NEXT:    movl 8(%rsi), %edi
+; CHECK-BASELINE-NEXT:    andl %r9d, %edi
 ; CHECK-BASELINE-NEXT:    notl %r9d
 ; CHECK-BASELINE-NEXT:    andl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    orl %eax, %r9d
-; CHECK-BASELINE-NEXT:    movl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %r9d
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %edi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
 ; CHECK-BASELINE-NEXT:    notl %r8d
 ; CHECK-BASELINE-NEXT:    andl 4(%rdx), %r8d
-; CHECK-BASELINE-NEXT:    orl %eax, %r8d
-; CHECK-BASELINE-NEXT:    movl (%rcx), %eax
-; CHECK-BASELINE-NEXT:    movl (%rsi), %ecx
-; CHECK-BASELINE-NEXT:    andl %eax, %ecx
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    andl (%rdx), %eax
-; CHECK-BASELINE-NEXT:    orl %ecx, %eax
-; CHECK-BASELINE-NEXT:    movl %ebp, 28(%rdi)
-; CHECK-BASELINE-NEXT:    movl %ebx, 24(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r15d, 20(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r11d, 16(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r10d, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r9d, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movl %r8d, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movl %eax, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    orl %edi, %r8d
+; CHECK-BASELINE-NEXT:    movl (%rcx), %ecx
+; CHECK-BASELINE-NEXT:    movl (%rsi), %esi
+; CHECK-BASELINE-NEXT:    andl %ecx, %esi
+; CHECK-BASELINE-NEXT:    notl %ecx
+; CHECK-BASELINE-NEXT:    andl (%rdx), %ecx
+; CHECK-BASELINE-NEXT:    orl %esi, %ecx
+; CHECK-BASELINE-NEXT:    movl %ebp, 28(%rax)
+; CHECK-BASELINE-NEXT:    movl %ebx, 24(%rax)
+; CHECK-BASELINE-NEXT:    movl %r15d, 20(%rax)
+; CHECK-BASELINE-NEXT:    movl %r11d, 16(%rax)
+; CHECK-BASELINE-NEXT:    movl %r10d, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %r9d, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %r8d, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %ecx, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r14
 ; CHECK-BASELINE-NEXT:    popq %r15
@@ -2356,6 +2362,7 @@
 ; CHECK-SSE1-NEXT:    pushq %r15
 ; CHECK-SSE1-NEXT:    pushq %r14
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl 4(%rcx), %r8d
 ; CHECK-SSE1-NEXT:    movl 8(%rcx), %r9d
 ; CHECK-SSE1-NEXT:    movl 12(%rcx), %r10d
@@ -2368,51 +2375,50 @@
 ; CHECK-SSE1-NEXT:    notl %ebp
 ; CHECK-SSE1-NEXT:    andl 28(%rdx), %ebp
 ; CHECK-SSE1-NEXT:    orl %r14d, %ebp
-; CHECK-SSE1-NEXT:    movl 24(%rsi), %eax
-; CHECK-SSE1-NEXT:    andl %ebx, %eax
+; CHECK-SSE1-NEXT:    movl 24(%rsi), %edi
+; CHECK-SSE1-NEXT:    andl %ebx, %edi
 ; CHECK-SSE1-NEXT:    notl %ebx
 ; CHECK-SSE1-NEXT:    andl 24(%rdx), %ebx
-; CHECK-SSE1-NEXT:    orl %eax, %ebx
-; CHECK-SSE1-NEXT:    movl 20(%rsi), %eax
-; CHECK-SSE1-NEXT:    andl %r15d, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %ebx
+; CHECK-SSE1-NEXT:    movl 20(%rsi), %edi
+; CHECK-SSE1-NEXT:    andl %r15d, %edi
 ; CHECK-SSE1-NEXT:    notl %r15d
 ; CHECK-SSE1-NEXT:    andl 20(%rdx), %r15d
-; CHECK-SSE1-NEXT:    orl %eax, %r15d
-; CHECK-SSE1-NEXT:    movl 16(%rsi), %eax
-; CHECK-SSE1-NEXT:    andl %r11d, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %r15d
+; CHECK-SSE1-NEXT:    movl 16(%rsi), %edi
+; CHECK-SSE1-NEXT:    andl %r11d, %edi
 ; CHECK-SSE1-NEXT:    notl %r11d
 ; CHECK-SSE1-NEXT:    andl 16(%rdx), %r11d
-; CHECK-SSE1-NEXT:    orl %eax, %r11d
-; CHECK-SSE1-NEXT:    movl 12(%rsi), %eax
-; CHECK-SSE1-NEXT:    andl %r10d, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %r11d
+; CHECK-SSE1-NEXT:    movl 12(%rsi), %edi
+; CHECK-SSE1-NEXT:    andl %r10d, %edi
 ; CHECK-SSE1-NEXT:    notl %r10d
 ; CHECK-SSE1-NEXT:    andl 12(%rdx), %r10d
-; CHECK-SSE1-NEXT:    orl %eax, %r10d
-; CHECK-SSE1-NEXT:    movl 8(%rsi), %eax
-; CHECK-SSE1-NEXT:    andl %r9d, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %r10d
+; CHECK-SSE1-NEXT:    movl 8(%rsi), %edi
+; CHECK-SSE1-NEXT:    andl %r9d, %edi
 ; CHECK-SSE1-NEXT:    notl %r9d
 ; CHECK-SSE1-NEXT:    andl 8(%rdx), %r9d
-; CHECK-SSE1-NEXT:    orl %eax, %r9d
-; CHECK-SSE1-NEXT:    movl 4(%rsi), %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %r9d
+; CHECK-SSE1-NEXT:    movl 4(%rsi), %edi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
 ; CHECK-SSE1-NEXT:    notl %r8d
 ; CHECK-SSE1-NEXT:    andl 4(%rdx), %r8d
-; CHECK-SSE1-NEXT:    orl %eax, %r8d
-; CHECK-SSE1-NEXT:    movl (%rcx), %eax
-; CHECK-SSE1-NEXT:    movl (%rsi), %ecx
-; CHECK-SSE1-NEXT:    andl %eax, %ecx
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    andl (%rdx), %eax
-; CHECK-SSE1-NEXT:    orl %ecx, %eax
-; CHECK-SSE1-NEXT:    movl %ebp, 28(%rdi)
-; CHECK-SSE1-NEXT:    movl %ebx, 24(%rdi)
-; CHECK-SSE1-NEXT:    movl %r15d, 20(%rdi)
-; CHECK-SSE1-NEXT:    movl %r11d, 16(%rdi)
-; CHECK-SSE1-NEXT:    movl %r10d, 12(%rdi)
-; CHECK-SSE1-NEXT:    movl %r9d, 8(%rdi)
-; CHECK-SSE1-NEXT:    movl %r8d, 4(%rdi)
-; CHECK-SSE1-NEXT:    movl %eax, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    orl %edi, %r8d
+; CHECK-SSE1-NEXT:    movl (%rcx), %ecx
+; CHECK-SSE1-NEXT:    movl (%rsi), %esi
+; CHECK-SSE1-NEXT:    andl %ecx, %esi
+; CHECK-SSE1-NEXT:    notl %ecx
+; CHECK-SSE1-NEXT:    andl (%rdx), %ecx
+; CHECK-SSE1-NEXT:    orl %esi, %ecx
+; CHECK-SSE1-NEXT:    movl %ebp, 28(%rax)
+; CHECK-SSE1-NEXT:    movl %ebx, 24(%rax)
+; CHECK-SSE1-NEXT:    movl %r15d, 20(%rax)
+; CHECK-SSE1-NEXT:    movl %r11d, 16(%rax)
+; CHECK-SSE1-NEXT:    movl %r10d, 12(%rax)
+; CHECK-SSE1-NEXT:    movl %r9d, 8(%rax)
+; CHECK-SSE1-NEXT:    movl %r8d, 4(%rax)
+; CHECK-SSE1-NEXT:    movl %ecx, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r14
 ; CHECK-SSE1-NEXT:    popq %r15
@@ -2453,70 +2459,70 @@
 ; CHECK-BASELINE-LABEL: out_v4i64:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movq (%rcx), %r8
 ; CHECK-BASELINE-NEXT:    movq 8(%rcx), %r9
-; CHECK-BASELINE-NEXT:    movq 16(%rcx), %rax
+; CHECK-BASELINE-NEXT:    movq 16(%rcx), %rdi
 ; CHECK-BASELINE-NEXT:    movq 24(%rcx), %rcx
 ; CHECK-BASELINE-NEXT:    movq 24(%rsi), %r10
 ; CHECK-BASELINE-NEXT:    andq %rcx, %r10
 ; CHECK-BASELINE-NEXT:    movq 16(%rsi), %r11
-; CHECK-BASELINE-NEXT:    andq %rax, %r11
+; CHECK-BASELINE-NEXT:    andq %rdi, %r11
 ; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rbx
 ; CHECK-BASELINE-NEXT:    andq %r9, %rbx
 ; CHECK-BASELINE-NEXT:    movq (%rsi), %rsi
 ; CHECK-BASELINE-NEXT:    andq %r8, %rsi
 ; CHECK-BASELINE-NEXT:    notq %r8
 ; CHECK-BASELINE-NEXT:    notq %r9
-; CHECK-BASELINE-NEXT:    notq %rax
+; CHECK-BASELINE-NEXT:    notq %rdi
 ; CHECK-BASELINE-NEXT:    notq %rcx
 ; CHECK-BASELINE-NEXT:    andq 24(%rdx), %rcx
 ; CHECK-BASELINE-NEXT:    orq %r10, %rcx
-; CHECK-BASELINE-NEXT:    andq 16(%rdx), %rax
-; CHECK-BASELINE-NEXT:    orq %r11, %rax
+; CHECK-BASELINE-NEXT:    andq 16(%rdx), %rdi
+; CHECK-BASELINE-NEXT:    orq %r11, %rdi
 ; CHECK-BASELINE-NEXT:    andq 8(%rdx), %r9
 ; CHECK-BASELINE-NEXT:    orq %rbx, %r9
 ; CHECK-BASELINE-NEXT:    andq (%rdx), %r8
 ; CHECK-BASELINE-NEXT:    orq %rsi, %r8
-; CHECK-BASELINE-NEXT:    movq %rcx, 24(%rdi)
-; CHECK-BASELINE-NEXT:    movq %rax, 16(%rdi)
-; CHECK-BASELINE-NEXT:    movq %r9, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movq %r8, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movq %rcx, 24(%rax)
+; CHECK-BASELINE-NEXT:    movq %rdi, 16(%rax)
+; CHECK-BASELINE-NEXT:    movq %r9, 8(%rax)
+; CHECK-BASELINE-NEXT:    movq %r8, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i64:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movq (%rcx), %r8
 ; CHECK-SSE1-NEXT:    movq 8(%rcx), %r9
-; CHECK-SSE1-NEXT:    movq 16(%rcx), %rax
+; CHECK-SSE1-NEXT:    movq 16(%rcx), %rdi
 ; CHECK-SSE1-NEXT:    movq 24(%rcx), %rcx
 ; CHECK-SSE1-NEXT:    movq 24(%rsi), %r10
 ; CHECK-SSE1-NEXT:    andq %rcx, %r10
 ; CHECK-SSE1-NEXT:    movq 16(%rsi), %r11
-; CHECK-SSE1-NEXT:    andq %rax, %r11
+; CHECK-SSE1-NEXT:    andq %rdi, %r11
 ; CHECK-SSE1-NEXT:    movq 8(%rsi), %rbx
 ; CHECK-SSE1-NEXT:    andq %r9, %rbx
 ; CHECK-SSE1-NEXT:    movq (%rsi), %rsi
 ; CHECK-SSE1-NEXT:    andq %r8, %rsi
 ; CHECK-SSE1-NEXT:    notq %r8
 ; CHECK-SSE1-NEXT:    notq %r9
-; CHECK-SSE1-NEXT:    notq %rax
+; CHECK-SSE1-NEXT:    notq %rdi
 ; CHECK-SSE1-NEXT:    notq %rcx
 ; CHECK-SSE1-NEXT:    andq 24(%rdx), %rcx
 ; CHECK-SSE1-NEXT:    orq %r10, %rcx
-; CHECK-SSE1-NEXT:    andq 16(%rdx), %rax
-; CHECK-SSE1-NEXT:    orq %r11, %rax
+; CHECK-SSE1-NEXT:    andq 16(%rdx), %rdi
+; CHECK-SSE1-NEXT:    orq %r11, %rdi
 ; CHECK-SSE1-NEXT:    andq 8(%rdx), %r9
 ; CHECK-SSE1-NEXT:    orq %rbx, %r9
 ; CHECK-SSE1-NEXT:    andq (%rdx), %r8
 ; CHECK-SSE1-NEXT:    orq %rsi, %r8
-; CHECK-SSE1-NEXT:    movq %rcx, 24(%rdi)
-; CHECK-SSE1-NEXT:    movq %rax, 16(%rdi)
-; CHECK-SSE1-NEXT:    movq %r9, 8(%rdi)
-; CHECK-SSE1-NEXT:    movq %r8, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movq %rcx, 24(%rax)
+; CHECK-SSE1-NEXT:    movq %rdi, 16(%rax)
+; CHECK-SSE1-NEXT:    movq %r9, 8(%rax)
+; CHECK-SSE1-NEXT:    movq %r8, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -2561,10 +2567,11 @@
 define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    xorl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %n0 = xor <1 x i8> %x, %y
   %n1 = and <1 x i8> %n0, %mask
@@ -2579,25 +2586,27 @@
 define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    xorl %edx, %edi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    xorl %edx, %edi
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v2i8:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl %edi, %eax
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    xorl %edx, %edi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    xorl %edx, %edi
-; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -2621,10 +2630,11 @@
 define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    xorl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %n0 = xor <1 x i16> %x, %y
   %n1 = and <1 x i16> %n0, %mask
@@ -2639,50 +2649,50 @@
 define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v4i8:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    xorb %al, %dl
-; CHECK-BASELINE-NEXT:    xorb %r11b, %cl
-; CHECK-BASELINE-NEXT:    xorb %r10b, %r8b
+; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
+; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
-; CHECK-BASELINE-NEXT:    xorb %al, %dl
-; CHECK-BASELINE-NEXT:    xorb %r11b, %cl
-; CHECK-BASELINE-NEXT:    xorb %r10b, %r8b
-; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
+; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v4i8:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    xorb %al, %dl
-; CHECK-SSE1-NEXT:    xorb %r11b, %cl
-; CHECK-SSE1-NEXT:    xorb %r10b, %r8b
+; CHECK-SSE1-NEXT:    xorb %r11b, %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %cl
+; CHECK-SSE1-NEXT:    xorb %dil, %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    xorb %r9b, %sil
-; CHECK-SSE1-NEXT:    xorb %al, %dl
-; CHECK-SSE1-NEXT:    xorb %r11b, %cl
-; CHECK-SSE1-NEXT:    xorb %r10b, %r8b
-; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
-; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
-; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    xorb %r11b, %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %cl
+; CHECK-SSE1-NEXT:    xorb %dil, %r8b
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %sil, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_v4i8:
@@ -2705,25 +2715,27 @@
 define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    xorl %edx, %edi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    xorl %edx, %edi
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v2i16:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movl %edi, %eax
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    xorl %edx, %edi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    xorl %edx, %edi
-; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -2747,10 +2759,10 @@
 define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v1i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    xorl %esi, %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    retq
   %n0 = xor <1 x i32> %x, %y
   %n1 = and <1 x i32> %n0, %mask
@@ -2772,47 +2784,46 @@
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movl %ecx, %r10d
-; CHECK-BASELINE-NEXT:    movl %edx, %r11d
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    xorb %bpl, %sil
-; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
-; CHECK-BASELINE-NEXT:    xorb %r12b, %r10b
-; CHECK-BASELINE-NEXT:    xorb %r15b, %r8b
-; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %sil
+; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r8b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    xorb %bpl, %sil
-; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
-; CHECK-BASELINE-NEXT:    xorb %r12b, %r10b
-; CHECK-BASELINE-NEXT:    xorb %r15b, %r8b
-; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
-; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %sil
+; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r10b
+; CHECK-BASELINE-NEXT:    xorb %r14b, %r8b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r9b
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, 7(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %cl, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb %dl, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 5(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %r10b, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
 ; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
@@ -2832,47 +2843,46 @@
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movl %ecx, %r10d
-; CHECK-SSE1-NEXT:    movl %edx, %r11d
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    xorb %bpl, %sil
-; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
-; CHECK-SSE1-NEXT:    xorb %r12b, %r10b
-; CHECK-SSE1-NEXT:    xorb %r15b, %r8b
-; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r13b, %sil
+; CHECK-SSE1-NEXT:    xorb %r12b, %dl
+; CHECK-SSE1-NEXT:    xorb %r15b, %r10b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r8b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    xorb %bpl, %sil
-; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
-; CHECK-SSE1-NEXT:    xorb %r12b, %r10b
-; CHECK-SSE1-NEXT:    xorb %r15b, %r8b
-; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
-; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
+; CHECK-SSE1-NEXT:    xorb %r13b, %sil
+; CHECK-SSE1-NEXT:    xorb %r12b, %dl
+; CHECK-SSE1-NEXT:    xorb %r15b, %r10b
+; CHECK-SSE1-NEXT:    xorb %r14b, %r8b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r9b
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
 ; CHECK-SSE1-NEXT:    movb %al, 7(%rdi)
 ; CHECK-SSE1-NEXT:    movb %cl, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb %dl, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 5(%rdi)
 ; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
 ; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
 ; CHECK-SSE1-NEXT:    movb %r10b, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
 ; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
@@ -2903,50 +2913,50 @@
 define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorl %eax, %edx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
 ; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    xorl %eax, %edx
+; CHECK-BASELINE-NEXT:    xorl %edi, %edx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %ecx
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
-; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movw %cx, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %dx, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movw %si, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v4i16:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorl %eax, %edx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
 ; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    xorl %eax, %edx
+; CHECK-SSE1-NEXT:    xorl %edi, %edx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %ecx
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
-; CHECK-SSE1-NEXT:    movw %r8w, 6(%rdi)
-; CHECK-SSE1-NEXT:    movw %cx, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %dx, 2(%rdi)
-; CHECK-SSE1-NEXT:    movw %si, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_v4i16:
@@ -2969,25 +2979,25 @@
 define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i32:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    xorl %edx, %edi
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
 ; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v2i32:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    xorl %edx, %edi
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
 ; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    movl %edi, %eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -3011,10 +3021,10 @@
 define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: in_v1i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorq %rsi, %rdi
-; CHECK-NEXT:    andq %rdx, %rdi
-; CHECK-NEXT:    xorq %rsi, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    xorq %rsi, %rax
+; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    xorq %rsi, %rax
 ; CHECK-NEXT:    retq
   %n0 = xor <1 x i64> %x, %y
   %n1 = and <1 x i64> %n0, %mask
@@ -3038,24 +3048,26 @@
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movq %rdi, %rdx
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    xorb %al, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-BASELINE-NEXT:    xorb %al, %r9b
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
+; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
@@ -3065,13 +3077,9 @@
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-BASELINE-NEXT:    xorb %bl, %r11b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    xorb %bpl, %bl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %bl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    xorb %bpl, %bl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    xorb %r13b, %bpl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    xorb %r13b, %bpl
+; CHECK-BASELINE-NEXT:    xorb %r13b, %bl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
@@ -3085,54 +3093,57 @@
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    xorb %sil, %r14b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r14b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    xorb %sil, %r14b
+; CHECK-BASELINE-NEXT:    xorb %bpl, %r14b
+; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-BASELINE-NEXT:    xorb %al, %bpl
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    xorb %cl, %al
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    xorb %cl, %al
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    xorb %sil, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb %sil, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdi)
-; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r14b, 13(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r15b, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r12b, 11(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bpl, 9(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bl, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb %dl, 5(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdx)
+; CHECK-BASELINE-NEXT:    movb %al, 14(%rdx)
+; CHECK-BASELINE-NEXT:    movb %bpl, 13(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r14b, 12(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r15b, 11(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r12b, 10(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r13b, 9(%rdx)
+; CHECK-BASELINE-NEXT:    movb %bl, 8(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdx)
+; CHECK-BASELINE-NEXT:    movb %dil, 5(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdx)
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    xorb %al, %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-BASELINE-NEXT:    xorb %al, %r8b
-; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdx)
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdx)
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdi)
+; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdx)
 ; CHECK-BASELINE-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movb %cl, (%rdx)
+; CHECK-BASELINE-NEXT:    movq %rdx, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -3152,24 +3163,26 @@
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movq %rdi, %rdx
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r12b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    xorb %al, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    xorb %dil, %r9b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-SSE1-NEXT:    xorb %al, %r9b
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    xorb %r10b, %dl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    xorb %r10b, %dl
+; CHECK-SSE1-NEXT:    xorb %dil, %r9b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    xorb %r10b, %dil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    xorb %r10b, %dil
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r10b
 ; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
@@ -3179,13 +3192,9 @@
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
 ; CHECK-SSE1-NEXT:    xorb %bl, %r11b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    xorb %bpl, %bl
+; CHECK-SSE1-NEXT:    xorb %r13b, %bl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    xorb %bpl, %bl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    xorb %r13b, %bpl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    xorb %r13b, %bpl
+; CHECK-SSE1-NEXT:    xorb %r13b, %bl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
@@ -3199,54 +3208,57 @@
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    xorb %sil, %r14b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r14b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    xorb %sil, %r14b
+; CHECK-SSE1-NEXT:    xorb %bpl, %r14b
+; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
+; CHECK-SSE1-NEXT:    xorb %al, %bpl
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    xorb %cl, %al
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    xorb %cl, %al
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    xorb %sil, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb %sil, %cl
-; CHECK-SSE1-NEXT:    movb %cl, 15(%rdi)
-; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
-; CHECK-SSE1-NEXT:    movb %r14b, 13(%rdi)
-; CHECK-SSE1-NEXT:    movb %r15b, 12(%rdi)
-; CHECK-SSE1-NEXT:    movb %r12b, 11(%rdi)
-; CHECK-SSE1-NEXT:    movb %r13b, 10(%rdi)
-; CHECK-SSE1-NEXT:    movb %bpl, 9(%rdi)
-; CHECK-SSE1-NEXT:    movb %bl, 8(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdi)
-; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb %dl, 5(%rdi)
-; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 15(%rdx)
+; CHECK-SSE1-NEXT:    movb %al, 14(%rdx)
+; CHECK-SSE1-NEXT:    movb %bpl, 13(%rdx)
+; CHECK-SSE1-NEXT:    movb %r14b, 12(%rdx)
+; CHECK-SSE1-NEXT:    movb %r15b, 11(%rdx)
+; CHECK-SSE1-NEXT:    movb %r12b, 10(%rdx)
+; CHECK-SSE1-NEXT:    movb %r13b, 9(%rdx)
+; CHECK-SSE1-NEXT:    movb %bl, 8(%rdx)
+; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdx)
+; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdx)
+; CHECK-SSE1-NEXT:    movb %dil, 5(%rdx)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdx)
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    xorb %al, %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-SSE1-NEXT:    xorb %al, %r8b
-; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdx)
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rdx)
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, 1(%rdi)
+; CHECK-SSE1-NEXT:    movb %cl, 1(%rdx)
 ; CHECK-SSE1-NEXT:    movb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movb %cl, (%rdx)
+; CHECK-SSE1-NEXT:    movq %rdx, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -3278,12 +3290,13 @@
 ; CHECK-BASELINE-NEXT:    pushq %rbp
 ; CHECK-BASELINE-NEXT:    pushq %r14
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %r8d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorl %edi, %ecx
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
@@ -3294,8 +3307,8 @@
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    xorl %edi, %ecx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %r8d
 ; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
@@ -3306,22 +3319,21 @@
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bp
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %ebp
 ; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    xorw %ax, %bx
+; CHECK-BASELINE-NEXT:    xorw %di, %bx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
-; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %ax
-; CHECK-BASELINE-NEXT:    xorl %r14d, %eax
-; CHECK-BASELINE-NEXT:    movw %ax, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bp, 10(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movw %cx, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %dx, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movw %si, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    xorl %edi, %ebx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorw %r14w, %di
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %di
+; CHECK-BASELINE-NEXT:    xorl %r14d, %edi
+; CHECK-BASELINE-NEXT:    movw %di, 14(%rax)
+; CHECK-BASELINE-NEXT:    movw %bx, 12(%rax)
+; CHECK-BASELINE-NEXT:    movw %bp, 10(%rax)
+; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
+; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
+; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r14
 ; CHECK-BASELINE-NEXT:    popq %rbp
@@ -3332,12 +3344,13 @@
 ; CHECK-SSE1-NEXT:    pushq %rbp
 ; CHECK-SSE1-NEXT:    pushq %r14
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r9d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    xorl %r11d, %r8d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorl %eax, %ecx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorl %edi, %ecx
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-SSE1-NEXT:    xorl %ebx, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
@@ -3348,8 +3361,8 @@
 ; CHECK-SSE1-NEXT:    xorl %ebx, %edx
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT:    xorl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    xorl %edi, %ecx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %r11d, %r8d
 ; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
@@ -3360,22 +3373,21 @@
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bp
 ; CHECK-SSE1-NEXT:    xorl %ebx, %ebp
 ; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    xorw %ax, %bx
+; CHECK-SSE1-NEXT:    xorw %di, %bx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
-; CHECK-SSE1-NEXT:    xorl %eax, %ebx
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorw %r14w, %ax
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %ax
-; CHECK-SSE1-NEXT:    xorl %r14d, %eax
-; CHECK-SSE1-NEXT:    movw %ax, 14(%rdi)
-; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
-; CHECK-SSE1-NEXT:    movw %bp, 10(%rdi)
-; CHECK-SSE1-NEXT:    movw %r9w, 8(%rdi)
-; CHECK-SSE1-NEXT:    movw %r8w, 6(%rdi)
-; CHECK-SSE1-NEXT:    movw %cx, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %dx, 2(%rdi)
-; CHECK-SSE1-NEXT:    movw %si, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    xorl %edi, %ebx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorw %r14w, %di
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %di
+; CHECK-SSE1-NEXT:    xorl %r14d, %edi
+; CHECK-SSE1-NEXT:    movw %di, 14(%rax)
+; CHECK-SSE1-NEXT:    movw %bx, 12(%rax)
+; CHECK-SSE1-NEXT:    movw %bp, 10(%rax)
+; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
+; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
+; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r14
 ; CHECK-SSE1-NEXT:    popq %rbp
@@ -3402,43 +3414,43 @@
 ; CHECK-BASELINE-LABEL: in_v4i32:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r8d
 ; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r9d
 ; CHECK-BASELINE-NEXT:    movl (%rdx), %r11d
 ; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
 ; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
-; CHECK-BASELINE-NEXT:    movl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
+; CHECK-BASELINE-NEXT:    movl 4(%rsi), %edi
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
 ; CHECK-BASELINE-NEXT:    movl 8(%rsi), %ebx
 ; CHECK-BASELINE-NEXT:    xorl %r9d, %ebx
 ; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
 ; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
 ; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
 ; CHECK-BASELINE-NEXT:    andl 8(%rcx), %ebx
-; CHECK-BASELINE-NEXT:    andl 4(%rcx), %eax
+; CHECK-BASELINE-NEXT:    andl 4(%rcx), %edi
 ; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
-; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
+; CHECK-BASELINE-NEXT:    xorl %r10d, %edi
 ; CHECK-BASELINE-NEXT:    xorl %r9d, %ebx
 ; CHECK-BASELINE-NEXT:    xorl %r8d, %esi
-; CHECK-BASELINE-NEXT:    movl %esi, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movl %ebx, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movl %eax, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movl %edx, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
+; CHECK-BASELINE-NEXT:    movl %ebx, 8(%rax)
+; CHECK-BASELINE-NEXT:    movl %edi, 4(%rax)
+; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v4i32:
 ; CHECK-SSE1:       # %bb.0:
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
 ; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
 ; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
 ; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
 ; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: in_v4i32:
@@ -3468,25 +3480,25 @@
 define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i64:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    xorq %rdx, %rdi
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
 ; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
 ; CHECK-BASELINE-NEXT:    andq %r9, %rsi
-; CHECK-BASELINE-NEXT:    andq %r8, %rdi
-; CHECK-BASELINE-NEXT:    xorq %rdx, %rdi
+; CHECK-BASELINE-NEXT:    andq %r8, %rax
+; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
 ; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v2i64:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    xorq %rdx, %rdi
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    xorq %rdx, %rax
 ; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
 ; CHECK-SSE1-NEXT:    andq %r9, %rsi
-; CHECK-SSE1-NEXT:    andq %r8, %rdi
-; CHECK-SSE1-NEXT:    xorq %rdx, %rdi
+; CHECK-SSE1-NEXT:    andq %r8, %rax
+; CHECK-SSE1-NEXT:    xorq %rdx, %rax
 ; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movq %rsi, %rdx
 ; CHECK-SSE1-NEXT:    retq
 ;
@@ -4076,142 +4088,141 @@
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r8
-; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 28(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movq %rcx, %r9
+; CHECK-BASELINE-NEXT:    movq %rdi, %r10
+; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 28(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl 24(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r11d
 ; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r14d
 ; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r15d
 ; CHECK-BASELINE-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r12d
 ; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebx
 ; CHECK-BASELINE-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl (%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 4(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %ebp
 ; CHECK-BASELINE-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl (%rsi), %edx
-; CHECK-BASELINE-NEXT:    xorw %ax, %dx
-; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %cx, %ax
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl (%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %cx, %dx
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %ecx
+; CHECK-BASELINE-NEXT:    xorw %ax, %cx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %bp, %ax
+; CHECK-BASELINE-NEXT:    xorw %di, %ax
 ; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %edx
-; CHECK-BASELINE-NEXT:    xorw %bx, %dx
+; CHECK-BASELINE-NEXT:    xorw %bp, %dx
 ; CHECK-BASELINE-NEXT:    movl %edx, %eax
 ; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %ecx
-; CHECK-BASELINE-NEXT:    xorw %r9w, %cx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorw %bx, %cx
 ; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %edx
-; CHECK-BASELINE-NEXT:    xorw %r10w, %dx
-; CHECK-BASELINE-NEXT:    movl %edx, %ecx
-; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %edx
-; CHECK-BASELINE-NEXT:    xorw %r12w, %dx
-; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r12d
-; CHECK-BASELINE-NEXT:    xorw %r15w, %r12w
-; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %r15d
-; CHECK-BASELINE-NEXT:    xorw %r14w, %r15w
-; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %r14d
-; CHECK-BASELINE-NEXT:    xorw %r11w, %r14w
-; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %ebp
-; CHECK-BASELINE-NEXT:    xorw %r13w, %bp
-; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %ebx
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r9d
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %r13d
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r13w # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    andw 30(%r8), %r13w
-; CHECK-BASELINE-NEXT:    andw 28(%r8), %r9w
-; CHECK-BASELINE-NEXT:    andw 26(%r8), %r10w
-; CHECK-BASELINE-NEXT:    andw 24(%r8), %r11w
-; CHECK-BASELINE-NEXT:    andw 22(%r8), %bx
-; CHECK-BASELINE-NEXT:    andw 20(%r8), %bp
-; CHECK-BASELINE-NEXT:    andw 18(%r8), %r14w
-; CHECK-BASELINE-NEXT:    andw 16(%r8), %r15w
-; CHECK-BASELINE-NEXT:    andw 14(%r8), %r12w
-; CHECK-BASELINE-NEXT:    andw 12(%r8), %dx
-; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    andw 10(%r8), %cx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw 8(%r8), %dx
-; CHECK-BASELINE-NEXT:    andw 6(%r8), %ax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw 4(%r8), %cx
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw 2(%r8), %ax
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw (%r8), %si
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl %ecx, %esi
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorw %r8w, %dx
 ; CHECK-BASELINE-NEXT:    movl %edx, %r8d
+; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %r13w, %dx
+; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r13d
+; CHECK-BASELINE-NEXT:    xorw %r12w, %r13w
+; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %r12d
+; CHECK-BASELINE-NEXT:    xorw %r15w, %r12w
+; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %r15d
+; CHECK-BASELINE-NEXT:    xorw %r14w, %r15w
+; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %r14d
+; CHECK-BASELINE-NEXT:    xorw %r11w, %r14w
+; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %edi
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    andw 30(%r9), %si
+; CHECK-BASELINE-NEXT:    andw 28(%r9), %di
+; CHECK-BASELINE-NEXT:    andw 26(%r9), %r11w
+; CHECK-BASELINE-NEXT:    andw 24(%r9), %bx
+; CHECK-BASELINE-NEXT:    andw 22(%r9), %bp
+; CHECK-BASELINE-NEXT:    andw 20(%r9), %r14w
+; CHECK-BASELINE-NEXT:    andw 18(%r9), %r15w
+; CHECK-BASELINE-NEXT:    andw 16(%r9), %r12w
+; CHECK-BASELINE-NEXT:    andw 14(%r9), %r13w
+; CHECK-BASELINE-NEXT:    andw 12(%r9), %dx
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    andw 10(%r9), %r8w
+; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl %ecx, %edx
+; CHECK-BASELINE-NEXT:    andw 8(%r9), %dx
+; CHECK-BASELINE-NEXT:    andw 6(%r9), %ax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 4(%r9), %r8w
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 2(%r9), %ax
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw (%r9), %cx
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %edx, %ecx
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movw %r13w, 30(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r9w, 28(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r10w, 26(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r11w, 24(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bx, 22(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bp, 20(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r14w, 18(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r15w, 16(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r12w, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movw %ax, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movw %dx, 10(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r8w, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movw %cx, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movw %si, 4(%rdi)
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movw %si, 30(%r10)
+; CHECK-BASELINE-NEXT:    movw %di, 28(%r10)
+; CHECK-BASELINE-NEXT:    movw %r11w, 26(%r10)
+; CHECK-BASELINE-NEXT:    movw %bx, 24(%r10)
+; CHECK-BASELINE-NEXT:    movw %bp, 22(%r10)
+; CHECK-BASELINE-NEXT:    movw %r14w, 20(%r10)
+; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r10)
+; CHECK-BASELINE-NEXT:    movw %r12w, 16(%r10)
+; CHECK-BASELINE-NEXT:    movw %r13w, 14(%r10)
+; CHECK-BASELINE-NEXT:    movw %ax, 12(%r10)
+; CHECK-BASELINE-NEXT:    movw %dx, 10(%r10)
+; CHECK-BASELINE-NEXT:    movw %cx, 8(%r10)
+; CHECK-BASELINE-NEXT:    movw %r9w, 6(%r10)
+; CHECK-BASELINE-NEXT:    movw %r8w, 4(%r10)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 2(%r10)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %ax, (%r10)
+; CHECK-BASELINE-NEXT:    movq %r10, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -4228,142 +4239,141 @@
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r8
-; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 28(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movq %rcx, %r9
+; CHECK-SSE1-NEXT:    movq %rdi, %r10
+; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edi
+; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 28(%rdx), %edi
+; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %edi
+; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl 24(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 20(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movl 20(%rdx), %r11d
 ; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 16(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r14d
 ; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %r15d
 ; CHECK-SSE1-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r12d
 ; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 8(%rdx), %r9d
-; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r8d
+; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebx
 ; CHECK-SSE1-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl (%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 4(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %ebp
 ; CHECK-SSE1-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %ecx
+; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl (%rsi), %edx
-; CHECK-SSE1-NEXT:    xorw %ax, %dx
-; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %cx, %ax
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %edi
+; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl (%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %cx, %dx
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %ecx
+; CHECK-SSE1-NEXT:    xorw %ax, %cx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %bp, %ax
+; CHECK-SSE1-NEXT:    xorw %di, %ax
 ; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %edx
-; CHECK-SSE1-NEXT:    xorw %bx, %dx
+; CHECK-SSE1-NEXT:    xorw %bp, %dx
 ; CHECK-SSE1-NEXT:    movl %edx, %eax
 ; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %ecx
-; CHECK-SSE1-NEXT:    xorw %r9w, %cx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorw %bx, %cx
 ; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %edx
-; CHECK-SSE1-NEXT:    xorw %r10w, %dx
-; CHECK-SSE1-NEXT:    movl %edx, %ecx
-; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %edx
-; CHECK-SSE1-NEXT:    xorw %r12w, %dx
-; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r12d
-; CHECK-SSE1-NEXT:    xorw %r15w, %r12w
-; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %r15d
-; CHECK-SSE1-NEXT:    xorw %r14w, %r15w
-; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %r14d
-; CHECK-SSE1-NEXT:    xorw %r11w, %r14w
-; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %ebp
-; CHECK-SSE1-NEXT:    xorw %r13w, %bp
-; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %ebx
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %r11d
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r9d
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %r13d
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r13w # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    andw 30(%r8), %r13w
-; CHECK-SSE1-NEXT:    andw 28(%r8), %r9w
-; CHECK-SSE1-NEXT:    andw 26(%r8), %r10w
-; CHECK-SSE1-NEXT:    andw 24(%r8), %r11w
-; CHECK-SSE1-NEXT:    andw 22(%r8), %bx
-; CHECK-SSE1-NEXT:    andw 20(%r8), %bp
-; CHECK-SSE1-NEXT:    andw 18(%r8), %r14w
-; CHECK-SSE1-NEXT:    andw 16(%r8), %r15w
-; CHECK-SSE1-NEXT:    andw 14(%r8), %r12w
-; CHECK-SSE1-NEXT:    andw 12(%r8), %dx
-; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    andw 10(%r8), %cx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw 8(%r8), %dx
-; CHECK-SSE1-NEXT:    andw 6(%r8), %ax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw 4(%r8), %cx
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw 2(%r8), %ax
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw (%r8), %si
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl %ecx, %esi
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorw %r8w, %dx
 ; CHECK-SSE1-NEXT:    movl %edx, %r8d
+; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %r13w, %dx
+; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r13d
+; CHECK-SSE1-NEXT:    xorw %r12w, %r13w
+; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %r12d
+; CHECK-SSE1-NEXT:    xorw %r15w, %r12w
+; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %r15d
+; CHECK-SSE1-NEXT:    xorw %r14w, %r15w
+; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %r14d
+; CHECK-SSE1-NEXT:    xorw %r11w, %r14w
+; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %edi
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    andw 30(%r9), %si
+; CHECK-SSE1-NEXT:    andw 28(%r9), %di
+; CHECK-SSE1-NEXT:    andw 26(%r9), %r11w
+; CHECK-SSE1-NEXT:    andw 24(%r9), %bx
+; CHECK-SSE1-NEXT:    andw 22(%r9), %bp
+; CHECK-SSE1-NEXT:    andw 20(%r9), %r14w
+; CHECK-SSE1-NEXT:    andw 18(%r9), %r15w
+; CHECK-SSE1-NEXT:    andw 16(%r9), %r12w
+; CHECK-SSE1-NEXT:    andw 14(%r9), %r13w
+; CHECK-SSE1-NEXT:    andw 12(%r9), %dx
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    andw 10(%r9), %r8w
+; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl %ecx, %edx
+; CHECK-SSE1-NEXT:    andw 8(%r9), %dx
+; CHECK-SSE1-NEXT:    andw 6(%r9), %ax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 4(%r9), %r8w
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 2(%r9), %ax
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw (%r9), %cx
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %edx, %ecx
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movw %r13w, 30(%rdi)
-; CHECK-SSE1-NEXT:    movw %r9w, 28(%rdi)
-; CHECK-SSE1-NEXT:    movw %r10w, 26(%rdi)
-; CHECK-SSE1-NEXT:    movw %r11w, 24(%rdi)
-; CHECK-SSE1-NEXT:    movw %bx, 22(%rdi)
-; CHECK-SSE1-NEXT:    movw %bp, 20(%rdi)
-; CHECK-SSE1-NEXT:    movw %r14w, 18(%rdi)
-; CHECK-SSE1-NEXT:    movw %r15w, 16(%rdi)
-; CHECK-SSE1-NEXT:    movw %r12w, 14(%rdi)
-; CHECK-SSE1-NEXT:    movw %ax, 12(%rdi)
-; CHECK-SSE1-NEXT:    movw %dx, 10(%rdi)
-; CHECK-SSE1-NEXT:    movw %r8w, 8(%rdi)
-; CHECK-SSE1-NEXT:    movw %cx, 6(%rdi)
-; CHECK-SSE1-NEXT:    movw %si, 4(%rdi)
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movw %si, 30(%r10)
+; CHECK-SSE1-NEXT:    movw %di, 28(%r10)
+; CHECK-SSE1-NEXT:    movw %r11w, 26(%r10)
+; CHECK-SSE1-NEXT:    movw %bx, 24(%r10)
+; CHECK-SSE1-NEXT:    movw %bp, 22(%r10)
+; CHECK-SSE1-NEXT:    movw %r14w, 20(%r10)
+; CHECK-SSE1-NEXT:    movw %r15w, 18(%r10)
+; CHECK-SSE1-NEXT:    movw %r12w, 16(%r10)
+; CHECK-SSE1-NEXT:    movw %r13w, 14(%r10)
+; CHECK-SSE1-NEXT:    movw %ax, 12(%r10)
+; CHECK-SSE1-NEXT:    movw %dx, 10(%r10)
+; CHECK-SSE1-NEXT:    movw %cx, 8(%r10)
+; CHECK-SSE1-NEXT:    movw %r9w, 6(%r10)
+; CHECK-SSE1-NEXT:    movw %r8w, 4(%r10)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 2(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 2(%r10)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %ax, (%r10)
+; CHECK-SSE1-NEXT:    movq %r10, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -4573,62 +4583,62 @@
 ; CHECK-BASELINE-LABEL: in_v4i64:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movq 24(%rdx), %r8
 ; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r9
 ; CHECK-BASELINE-NEXT:    movq (%rdx), %r11
 ; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
 ; CHECK-BASELINE-NEXT:    movq (%rsi), %rdx
 ; CHECK-BASELINE-NEXT:    xorq %r11, %rdx
-; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rax
-; CHECK-BASELINE-NEXT:    xorq %r10, %rax
+; CHECK-BASELINE-NEXT:    movq 8(%rsi), %rdi
+; CHECK-BASELINE-NEXT:    xorq %r10, %rdi
 ; CHECK-BASELINE-NEXT:    movq 16(%rsi), %rbx
 ; CHECK-BASELINE-NEXT:    xorq %r9, %rbx
 ; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
 ; CHECK-BASELINE-NEXT:    xorq %r8, %rsi
 ; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
 ; CHECK-BASELINE-NEXT:    andq 16(%rcx), %rbx
-; CHECK-BASELINE-NEXT:    andq 8(%rcx), %rax
+; CHECK-BASELINE-NEXT:    andq 8(%rcx), %rdi
 ; CHECK-BASELINE-NEXT:    andq (%rcx), %rdx
 ; CHECK-BASELINE-NEXT:    xorq %r11, %rdx
-; CHECK-BASELINE-NEXT:    xorq %r10, %rax
+; CHECK-BASELINE-NEXT:    xorq %r10, %rdi
 ; CHECK-BASELINE-NEXT:    xorq %r9, %rbx
 ; CHECK-BASELINE-NEXT:    xorq %r8, %rsi
-; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rdi)
-; CHECK-BASELINE-NEXT:    movq %rbx, 16(%rdi)
-; CHECK-BASELINE-NEXT:    movq %rax, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdx, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rax)
+; CHECK-BASELINE-NEXT:    movq %rbx, 16(%rax)
+; CHECK-BASELINE-NEXT:    movq %rdi, 8(%rax)
+; CHECK-BASELINE-NEXT:    movq %rdx, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: in_v4i64:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movq 24(%rdx), %r8
 ; CHECK-SSE1-NEXT:    movq 16(%rdx), %r9
 ; CHECK-SSE1-NEXT:    movq (%rdx), %r11
 ; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
 ; CHECK-SSE1-NEXT:    movq (%rsi), %rdx
 ; CHECK-SSE1-NEXT:    xorq %r11, %rdx
-; CHECK-SSE1-NEXT:    movq 8(%rsi), %rax
-; CHECK-SSE1-NEXT:    xorq %r10, %rax
+; CHECK-SSE1-NEXT:    movq 8(%rsi), %rdi
+; CHECK-SSE1-NEXT:    xorq %r10, %rdi
 ; CHECK-SSE1-NEXT:    movq 16(%rsi), %rbx
 ; CHECK-SSE1-NEXT:    xorq %r9, %rbx
 ; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
 ; CHECK-SSE1-NEXT:    xorq %r8, %rsi
 ; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
 ; CHECK-SSE1-NEXT:    andq 16(%rcx), %rbx
-; CHECK-SSE1-NEXT:    andq 8(%rcx), %rax
+; CHECK-SSE1-NEXT:    andq 8(%rcx), %rdi
 ; CHECK-SSE1-NEXT:    andq (%rcx), %rdx
 ; CHECK-SSE1-NEXT:    xorq %r11, %rdx
-; CHECK-SSE1-NEXT:    xorq %r10, %rax
+; CHECK-SSE1-NEXT:    xorq %r10, %rdi
 ; CHECK-SSE1-NEXT:    xorq %r9, %rbx
 ; CHECK-SSE1-NEXT:    xorq %r8, %rsi
-; CHECK-SSE1-NEXT:    movq %rsi, 24(%rdi)
-; CHECK-SSE1-NEXT:    movq %rbx, 16(%rdi)
-; CHECK-SSE1-NEXT:    movq %rax, 8(%rdi)
-; CHECK-SSE1-NEXT:    movq %rdx, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movq %rsi, 24(%rax)
+; CHECK-SSE1-NEXT:    movq %rbx, 16(%rax)
+; CHECK-SSE1-NEXT:    movq %rdi, 8(%rax)
+; CHECK-SSE1-NEXT:    movq %rdx, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/urem-power-of-two.ll b/llvm/test/CodeGen/X86/urem-power-of-two.ll
index 5517c35..c60305e 100644
--- a/llvm/test/CodeGen/X86/urem-power-of-two.ll
+++ b/llvm/test/CodeGen/X86/urem-power-of-two.ll
@@ -14,8 +14,8 @@
 ;
 ; X64-LABEL: const_pow_2:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $31, %edi
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andl $31, %eax
 ; X64-NEXT:    retq
   %urem = urem i64 %x, 32
   ret i64 %urem
@@ -35,8 +35,9 @@
 ;
 ; X64-LABEL: shift_left_pow_2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $1, %eax
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
 ; X64-NEXT:    addl $33554431, %eax # imm = 0x1FFFFFF
 ; X64-NEXT:    andl %edi, %eax
@@ -61,8 +62,9 @@
 ;
 ; X64-LABEL: shift_right_pow_2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $32768, %eax # imm = 0x8000
 ; X64-NEXT:    movl %esi, %ecx
+; X64-NEXT:    movl $32768, %eax # imm = 0x8000
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    decl %eax
 ; X64-NEXT:    andl %edi, %eax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll
index cda7980..9cc450b 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll
@@ -26,10 +26,10 @@
 ;
 ; X64-LABEL: test_minsize:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    pushq $5
 ; X64-NEXT:    popq %rcx
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    divl %ecx
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    pushq $42
diff --git a/llvm/test/CodeGen/X86/use-add-flags.ll b/llvm/test/CodeGen/X86/use-add-flags.ll
index 37baef9..3e7d84c 100644
--- a/llvm/test/CodeGen/X86/use-add-flags.ll
+++ b/llvm/test/CodeGen/X86/use-add-flags.ll
@@ -10,16 +10,16 @@
 define i32 @test1(i32* %x, i32 %y, i32 %a, i32 %b) nounwind {
 ; LNX-LABEL: test1:
 ; LNX:       # %bb.0:
-; LNX-NEXT:    addl (%rdi), %esi
-; LNX-NEXT:    cmovnsl %ecx, %edx
 ; LNX-NEXT:    movl %edx, %eax
+; LNX-NEXT:    addl (%rdi), %esi
+; LNX-NEXT:    cmovnsl %ecx, %eax
 ; LNX-NEXT:    retq
 ;
 ; WIN-LABEL: test1:
 ; WIN:       # %bb.0:
-; WIN-NEXT:    addl (%rcx), %edx
-; WIN-NEXT:    cmovnsl %r9d, %r8d
 ; WIN-NEXT:    movl %r8d, %eax
+; WIN-NEXT:    addl (%rcx), %edx
+; WIN-NEXT:    cmovnsl %r9d, %eax
 ; WIN-NEXT:    retq
 	%tmp2 = load i32, i32* %x, align 4		; <i32> [#uses=1]
 	%tmp4 = add i32 %tmp2, %y		; <i32> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll
index 2fcdf84..dc406c4 100644
--- a/llvm/test/CodeGen/X86/vec_cast.ll
+++ b/llvm/test/CodeGen/X86/vec_cast.ll
@@ -175,11 +175,13 @@
 ; CHECK-LIN-LABEL: i:
 ; CHECK-LIN:       # %bb.0:
 ; CHECK-LIN-NEXT:    movl %edi, %eax
+; CHECK-LIN-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-LIN-NEXT:    retq
 ;
 ; CHECK-WIN-LABEL: i:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    movl %ecx, %eax
+; CHECK-WIN-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-WIN-NEXT:    retq
   %c = trunc <1 x i32> %a to <1 x i16>
   ret <1 x i16> %c
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index c0a41d0..fa4c8ab 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -14,38 +14,40 @@
 define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; SSE-LABEL: test_bitreverse_i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    rolb $4, %dil
 ; SSE-NEXT:    movl %edi, %eax
-; SSE-NEXT:    andb $51, %al
-; SSE-NEXT:    shlb $2, %al
-; SSE-NEXT:    andb $-52, %dil
-; SSE-NEXT:    shrb $2, %dil
-; SSE-NEXT:    orb %al, %dil
-; SSE-NEXT:    movl %edi, %eax
-; SSE-NEXT:    andb $85, %al
-; SSE-NEXT:    addb %al, %al
-; SSE-NEXT:    andb $-86, %dil
-; SSE-NEXT:    shrb %dil
-; SSE-NEXT:    orb %al, %dil
-; SSE-NEXT:    movl %edi, %eax
+; SSE-NEXT:    rolb $4, %al
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andb $51, %cl
+; SSE-NEXT:    shlb $2, %cl
+; SSE-NEXT:    andb $-52, %al
+; SSE-NEXT:    shrb $2, %al
+; SSE-NEXT:    orb %cl, %al
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andb $85, %cl
+; SSE-NEXT:    addb %cl, %cl
+; SSE-NEXT:    andb $-86, %al
+; SSE-NEXT:    shrb %al
+; SSE-NEXT:    orb %cl, %al
+; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_bitreverse_i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    rolb $4, %dil
 ; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andb $51, %al
-; AVX-NEXT:    shlb $2, %al
-; AVX-NEXT:    andb $-52, %dil
-; AVX-NEXT:    shrb $2, %dil
-; AVX-NEXT:    orb %al, %dil
-; AVX-NEXT:    movl %edi, %eax
-; AVX-NEXT:    andb $85, %al
-; AVX-NEXT:    addb %al, %al
-; AVX-NEXT:    andb $-86, %dil
-; AVX-NEXT:    shrb %dil
-; AVX-NEXT:    orb %al, %dil
-; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    rolb $4, %al
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    andb $51, %cl
+; AVX-NEXT:    shlb $2, %cl
+; AVX-NEXT:    andb $-52, %al
+; AVX-NEXT:    shrb $2, %al
+; AVX-NEXT:    orb %cl, %al
+; AVX-NEXT:    movl %eax, %ecx
+; AVX-NEXT:    andb $85, %cl
+; AVX-NEXT:    addb %cl, %cl
+; AVX-NEXT:    andb $-86, %al
+; AVX-NEXT:    shrb %al
+; AVX-NEXT:    orb %cl, %al
+; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: test_bitreverse_i8:
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index d2b31b4..934d102 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -338,30 +338,30 @@
 define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
 ; SSE2-LABEL: vsel_double8:
 ; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    movaps %xmm5, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
 ; SSE2-NEXT:    movapd %xmm4, %xmm0
-; SSE2-NEXT:    movaps %xmm5, %xmm1
 ; SSE2-NEXT:    movapd %xmm6, %xmm2
-; SSE2-NEXT:    movaps %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_double8:
 ; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm7, %xmm3
+; SSSE3-NEXT:    movaps %xmm5, %xmm1
 ; SSSE3-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
 ; SSSE3-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
 ; SSSE3-NEXT:    movapd %xmm4, %xmm0
-; SSSE3-NEXT:    movaps %xmm5, %xmm1
 ; SSSE3-NEXT:    movapd %xmm6, %xmm2
-; SSSE3-NEXT:    movaps %xmm7, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_double8:
 ; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    movaps %xmm5, %xmm1
 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
 ; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3]
-; SSE41-NEXT:    movaps %xmm5, %xmm1
-; SSE41-NEXT:    movaps %xmm7, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_double8:
@@ -377,30 +377,30 @@
 define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
 ; SSE2-LABEL: vsel_i648:
 ; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    movaps %xmm5, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
 ; SSE2-NEXT:    movapd %xmm4, %xmm0
-; SSE2-NEXT:    movaps %xmm5, %xmm1
 ; SSE2-NEXT:    movapd %xmm6, %xmm2
-; SSE2-NEXT:    movaps %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: vsel_i648:
 ; SSSE3:       # %bb.0: # %entry
+; SSSE3-NEXT:    movaps %xmm7, %xmm3
+; SSSE3-NEXT:    movaps %xmm5, %xmm1
 ; SSSE3-NEXT:    movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
 ; SSSE3-NEXT:    movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
 ; SSSE3-NEXT:    movapd %xmm4, %xmm0
-; SSSE3-NEXT:    movaps %xmm5, %xmm1
 ; SSSE3-NEXT:    movapd %xmm6, %xmm2
-; SSSE3-NEXT:    movaps %xmm7, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: vsel_i648:
 ; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    movaps %xmm5, %xmm1
 ; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
 ; SSE41-NEXT:    blendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3]
-; SSE41-NEXT:    movaps %xmm5, %xmm1
-; SSE41-NEXT:    movaps %xmm7, %xmm3
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: vsel_i648:
@@ -528,22 +528,22 @@
 define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
 ; SSE2-LABEL: constant_blendvpd_avx:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
 ; SSE2-NEXT:    movapd %xmm3, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: constant_blendvpd_avx:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
 ; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
 ; SSSE3-NEXT:    movapd %xmm3, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_blendvpd_avx:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: constant_blendvpd_avx:
@@ -740,20 +740,20 @@
 define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2-LABEL: blend_shufflevector_4xi64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: blend_shufflevector_4xi64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: blend_shufflevector_4xi64:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; SSE41-NEXT:    movaps %xmm3, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: blend_shufflevector_4xi64:
diff --git a/llvm/test/CodeGen/X86/vector-compare-results.ll b/llvm/test/CodeGen/X86/vector-compare-results.ll
index 87a189e..ed4089e 100644
--- a/llvm/test/CodeGen/X86/vector-compare-results.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-results.ll
@@ -344,254 +344,254 @@
 define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v32i8:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
 ; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32i8:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    pcmpgtb %xmm2, %xmm0
 ; SSE42-NEXT:    pcmpgtb %xmm3, %xmm1
-; SSE42-NEXT:    pextrb $1, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
+; SSE42-NEXT:    pextrb $1, %xmm1, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm1, %edx
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm1, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm1, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm1, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm0, %edx
+; SSE42-NEXT:    pextrb $6, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm0, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm0, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm0, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm1, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm0, %ecx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm1, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm0, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm0, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $5, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm0, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i8:
@@ -933,6 +933,7 @@
 define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v32i16:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pcmpgtw %xmm5, %xmm1
 ; SSE2-NEXT:    pcmpgtw %xmm4, %xmm0
 ; SSE2-NEXT:    packsswb %xmm1, %xmm0
@@ -940,253 +941,252 @@
 ; SSE2-NEXT:    pcmpgtw %xmm6, %xmm2
 ; SSE2-NEXT:    packsswb %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32i16:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    pcmpgtw %xmm5, %xmm1
 ; SSE42-NEXT:    pcmpgtw %xmm4, %xmm0
 ; SSE42-NEXT:    pcmpgtw %xmm7, %xmm3
 ; SSE42-NEXT:    pcmpgtw %xmm6, %xmm2
-; SSE42-NEXT:    pextrb $2, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
+; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $4, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $6, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $12, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm2, %edx
+; SSE42-NEXT:    pextrb $0, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $2, %xmm3, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm3, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $6, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm3, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm3, %ecx
+; SSE42-NEXT:    pextrb $10, %xmm2, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm3, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm0, %edx
+; SSE42-NEXT:    pextrb $12, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $2, %xmm1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $14, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $2, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $4, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $6, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm1, %ecx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $10, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $14, %xmm3, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm1, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $6, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $14, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $2, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $4, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $6, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $10, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $14, %xmm1, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i16:
@@ -1247,500 +1247,501 @@
 define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v64i8:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pcmpgtb %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtb %xmm5, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm6, %xmm2
 ; SSE2-NEXT:    pcmpgtb %xmm7, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 6(%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 4(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 4(%rdi)
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v64i8:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    pcmpgtb %xmm4, %xmm0
 ; SSE42-NEXT:    pcmpgtb %xmm5, %xmm1
 ; SSE42-NEXT:    pcmpgtb %xmm6, %xmm2
 ; SSE42-NEXT:    pcmpgtb %xmm7, %xmm3
-; SSE42-NEXT:    pextrb $1, %xmm3, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm3, %ecx
+; SSE42-NEXT:    pextrb $1, %xmm3, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm3, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm3, %edx
+; SSE42-NEXT:    pextrb $0, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm3, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm3, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm3, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm3, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm3, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm3, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 6(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm2, %edx
+; SSE42-NEXT:    pextrb $6, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm2, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm2, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm2, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm2, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 4(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm3, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm3, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 6(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm2, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm1, %edx
+; SSE42-NEXT:    pextrb $0, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm1, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm0, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm2, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm0, %edx
+; SSE42-NEXT:    pextrb $6, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm0, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm0, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm2, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm0, %ecx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm2, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 4(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm1, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm0, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $5, %xmm1, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm1, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
+; SSE42-NEXT:    orl %ecx, %edx
+; SSE42-NEXT:    pextrb $5, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm0, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v64i8:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm5, %xmm4
@@ -1749,509 +1750,508 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
+; AVX1-NEXT:    vpextrb $1, %xmm1, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm1, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm1, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm1, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $3, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm1, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm1, %ecx
+; AVX1-NEXT:    vpextrb $5, %xmm1, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm1, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm2, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm2, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm2, %edx
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm2, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm2, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm2, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm2, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm2, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, 4(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm0, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm0, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm0, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm0, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm4, %ecx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm2, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, 4(%rdi)
+; AVX1-NEXT:    vpextrb $1, %xmm0, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm4, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm4, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm4, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $3, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm4, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm4, %ecx
+; AVX1-NEXT:    vpextrb $5, %xmm0, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm4, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm4, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm4, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm0, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm4, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, (%rdi)
-; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm0, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm0, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm4, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_cmp_v64i8:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
+; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm1, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
 ; AVX2-NEXT:    vpextrb $3, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, 4(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, 4(%rdi)
+; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
 ; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, (%rdi)
-; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -2394,6 +2394,7 @@
 define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v32f32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
 ; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
 ; SSE2-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
@@ -2426,130 +2427,130 @@
 ; SSE2-NEXT:    packuswb %xmm11, %xmm9
 ; SSE2-NEXT:    packuswb %xmm10, %xmm9
 ; SSE2-NEXT:    movdqa %xmm9, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32f32:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
@@ -2566,125 +2567,124 @@
 ; SSE42-NEXT:    cmpltps %xmm6, %xmm13
 ; SSE42-NEXT:    cmpltps %xmm5, %xmm14
 ; SSE42-NEXT:    cmpltps %xmm4, %xmm15
-; SSE42-NEXT:    pextrb $4, %xmm15, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm15, %ecx
+; SSE42-NEXT:    pextrb $4, %xmm15, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $8, %xmm15, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $12, %xmm15, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $0, %xmm14, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm14, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $8, %xmm14, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm14, %edx
+; SSE42-NEXT:    pextrb $0, %xmm15, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm13, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm13, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm15, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm13, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm13, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $12, %xmm15, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm12, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm12, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm14, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm12, %ecx
+; SSE42-NEXT:    pextrb $4, %xmm14, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm12, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm11, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm11, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $8, %xmm11, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $12, %xmm11, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $0, %xmm10, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm10, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $8, %xmm10, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm10, %edx
+; SSE42-NEXT:    pextrb $8, %xmm14, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm9, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm9, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $12, %xmm14, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm13, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm9, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm9, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm13, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm13, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm8, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm8, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm13, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm12, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm8, %ecx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm12, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm12, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm12, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $4, %xmm11, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm8, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm11, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm11, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $12, %xmm11, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm10, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $4, %xmm10, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $8, %xmm10, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $12, %xmm10, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm9, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm9, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm9, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm9, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm8, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm8, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm8, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm8, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32f32:
@@ -2954,6 +2954,7 @@
 define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v32i32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255]
 ; SSE2-NEXT:    pand %xmm8, %xmm3
@@ -2978,130 +2979,130 @@
 ; SSE2-NEXT:    packuswb %xmm5, %xmm4
 ; SSE2-NEXT:    packuswb %xmm6, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32i32:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm2
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm1
@@ -3110,125 +3111,124 @@
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm6
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    pextrb $4, %xmm4, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm4, %ecx
+; SSE42-NEXT:    pextrb $4, %xmm4, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $8, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $12, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $0, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm5, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $8, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm5, %edx
+; SSE42-NEXT:    pextrb $0, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm6, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm6, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $12, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm7, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm7, %ecx
+; SSE42-NEXT:    pextrb $4, %xmm5, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm7, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm1, %edx
+; SSE42-NEXT:    pextrb $8, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm2, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $12, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm2, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $4, %xmm3, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm3, %ecx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm7, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm7, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $12, %xmm3, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $12, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $4, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $12, %xmm3, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i32:
@@ -3309,6 +3309,7 @@
 define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v64i16:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    packsswb %xmm1, %xmm0
@@ -3322,250 +3323,250 @@
 ; SSE2-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm6
 ; SSE2-NEXT:    packsswb %xmm7, %xmm6
 ; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 6(%rdi)
 ; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 4(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 4(%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v64i16:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm3
@@ -3574,247 +3575,247 @@
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm4
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm7
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    pextrb $2, %xmm6, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm6, %ecx
+; SSE42-NEXT:    pextrb $2, %xmm6, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $4, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $6, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $8, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm6, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $12, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm6, %edx
+; SSE42-NEXT:    pextrb $0, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $2, %xmm7, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm7, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $6, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm7, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm7, %ecx
+; SSE42-NEXT:    pextrb $10, %xmm6, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm7, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 6(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm4, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $4, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $6, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $8, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm4, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $12, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm4, %edx
+; SSE42-NEXT:    pextrb $12, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $2, %xmm5, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $14, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm5, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $2, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $4, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm5, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $6, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm5, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 4(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $4, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $6, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $12, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm2, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $10, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm3, %ecx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $14, %xmm7, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 6(%rdi)
+; SSE42-NEXT:    pextrb $2, %xmm4, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $2, %xmm3, %edx
+; SSE42-NEXT:    pextrb $0, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm3, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm3, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $6, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm3, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $12, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm0, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
+; SSE42-NEXT:    pextrb $10, %xmm4, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $2, %xmm1, %edx
+; SSE42-NEXT:    pextrb $12, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $6, %xmm1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $14, %xmm4, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $10, %xmm1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $2, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $4, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm1, %ecx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $6, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm5, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $10, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm5, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $14, %xmm5, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 4(%rdi)
+; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $14, %xmm1, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $6, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $10, %xmm2, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $12, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $14, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $2, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $4, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $6, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $10, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $14, %xmm3, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $6, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
+; SSE42-NEXT:    orl %ecx, %edx
+; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $14, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $2, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $4, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $6, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $10, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $14, %xmm1, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v64i16:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
 ; AVX1-NEXT:    vpcmpgtw %xmm8, %xmm9, %xmm8
@@ -3831,258 +3832,258 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtw %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrb $2, %xmm2, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm2, %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm2, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm2, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm2, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm7, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm7, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm7, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm7, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $6, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm7, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm7, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $8, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm7, %ecx
+; AVX1-NEXT:    vpextrb $10, %xmm2, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm7, %edx
+; AVX1-NEXT:    vpextrb $12, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm3, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm7, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm3, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $2, %xmm7, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm7, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm3, %edx
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm7, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm7, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm3, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $10, %xmm7, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm7, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm4, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm7, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm4, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $2, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm4, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm4, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, 4(%rdi)
-; AVX1-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $10, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm5, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm5, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $2, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm5, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm5, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $10, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm9, %ecx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm4, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, 4(%rdi)
+; AVX1-NEXT:    vpextrb $2, %xmm0, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm9, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm9, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm9, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $6, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm9, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $8, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm8, %ecx
+; AVX1-NEXT:    vpextrb $10, %xmm0, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $2, %xmm8, %edx
+; AVX1-NEXT:    vpextrb $12, %xmm0, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $6, %xmm8, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm0, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $10, %xmm8, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $2, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $14, %xmm8, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, (%rdi)
-; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm5, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $10, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm5, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $2, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $10, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $2, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $10, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $14, %xmm8, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_cmp_v64i16:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vpcmpgtw %ymm5, %ymm1, %ymm5
 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm1
 ; AVX2-NEXT:    vpcmpgtw %ymm4, %ymm0, %ymm4
@@ -4091,253 +4092,252 @@
 ; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm3
 ; AVX2-NEXT:    vpcmpgtw %ymm6, %ymm2, %ymm6
 ; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm2
-; AVX2-NEXT:    vpextrb $2, %xmm6, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm6, %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm6, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm6, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $6, %xmm6, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $8, %xmm6, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm6, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $12, %xmm6, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm6, %edx
+; AVX2-NEXT:    vpextrb $0, %xmm6, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm2, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm6, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm2, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX2-NEXT:    vpextrb $6, %xmm6, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm2, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $8, %xmm6, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm2, %ecx
+; AVX2-NEXT:    vpextrb $10, %xmm6, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
+; AVX2-NEXT:    vpextrb $12, %xmm6, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $0, %xmm7, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm7, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm6, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $0, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm7, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm7, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $2, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm7, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm7, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm7, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm7, %edx
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $10, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm3, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $0, %xmm7, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm3, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $2, %xmm7, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm7, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm3, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm7, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm7, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm3, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, 4(%rdi)
-; AVX2-NEXT:    vpextrb $2, %xmm4, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm4, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm4, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $10, %xmm7, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm7, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm7, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $0, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $2, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $10, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $0, %xmm5, %ecx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm3, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, 4(%rdi)
+; AVX2-NEXT:    vpextrb $2, %xmm4, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm5, %edx
+; AVX2-NEXT:    vpextrb $0, %xmm4, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm5, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm5, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm4, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm5, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm5, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX2-NEXT:    vpextrb $6, %xmm4, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm5, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm5, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $8, %xmm4, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
+; AVX2-NEXT:    vpextrb $10, %xmm4, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
+; AVX2-NEXT:    vpextrb $12, %xmm4, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm4, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $2, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, (%rdi)
-; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $10, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $0, %xmm5, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $2, %xmm5, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm5, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm5, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm5, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $10, %xmm5, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm5, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm5, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $2, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $10, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $14, %xmm1, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -4390,6 +4390,7 @@
 ; SSE2-LABEL: test_cmp_v128i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm0
 ; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm1
 ; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm2
@@ -4399,491 +4400,491 @@
 ; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm6
 ; SSE2-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm7
 ; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 14(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 14(%rdi)
 ; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 12(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 12(%rdi)
 ; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 10(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 10(%rdi)
 ; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 8(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 8(%rdi)
 ; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 6(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 6(%rdi)
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 4(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 4(%rdi)
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    popq %rcx
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v128i8:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm1
 ; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm2
@@ -4892,483 +4893,483 @@
 ; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm6
 ; SSE42-NEXT:    pcmpgtb {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pextrb $1, %xmm7, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm7, %ecx
+; SSE42-NEXT:    pextrb $1, %xmm7, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm7, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm7, %edx
+; SSE42-NEXT:    pextrb $0, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm7, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm7, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm7, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm7, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm7, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm7, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm7, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 14(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm6, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm6, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm6, %edx
+; SSE42-NEXT:    pextrb $6, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm6, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm6, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm6, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm6, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 12(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm5, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm5, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm5, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm7, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm7, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm5, %ecx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm7, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 14(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm6, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm5, %edx
+; SSE42-NEXT:    pextrb $0, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm5, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm5, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm5, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 10(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm4, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm4, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm4, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm4, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm6, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm4, %edx
+; SSE42-NEXT:    pextrb $6, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm4, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm4, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm4, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 8(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm3, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm3, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm3, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm3, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm6, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm3, %ecx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm6, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 12(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm5, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm3, %edx
+; SSE42-NEXT:    pextrb $0, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm3, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm3, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 6(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm2, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm2, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm2, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm5, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm2, %edx
+; SSE42-NEXT:    pextrb $6, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm2, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm2, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 4(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm1, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm5, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm1, %ecx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm5, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 10(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm4, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm1, %edx
+; SSE42-NEXT:    pextrb $0, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm1, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $2, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $3, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $6, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $7, %xmm0, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $9, %xmm0, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $10, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $11, %xmm0, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $12, %xmm0, %ecx
+; SSE42-NEXT:    pextrb $5, %xmm4, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $13, %xmm0, %edx
+; SSE42-NEXT:    pextrb $6, %xmm4, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $14, %xmm0, %ecx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm4, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm4, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm4, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm4, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm4, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm4, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm4, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm4, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm4, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 8(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm3, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $15, %xmm0, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $5, %xmm3, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm3, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 6(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm2, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    pextrb $0, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
+; SSE42-NEXT:    orl %ecx, %edx
+; SSE42-NEXT:    pextrb $5, %xmm2, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm2, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm2, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 4(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm1, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
+; SSE42-NEXT:    orl %ecx, %edx
+; SSE42-NEXT:    pextrb $5, %xmm1, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm1, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm1, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $1, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $2, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $3, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $4, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
+; SSE42-NEXT:    orl %ecx, %edx
+; SSE42-NEXT:    pextrb $5, %xmm0, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $6, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $7, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $8, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $9, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $10, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $11, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $13, %xmm0, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $14, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $15, %xmm0, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v128i8:
 ; AVX1:       # %bb.0:
+; AVX1-NEXT:    movq %rdi, %rax
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
 ; AVX1-NEXT:    vpcmpgtb %xmm8, %xmm9, %xmm8
@@ -5385,1010 +5386,1010 @@
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtb %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm3, %ecx
+; AVX1-NEXT:    vpextrb $1, %xmm3, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm3, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm3, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm3, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $3, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm3, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm3, %ecx
+; AVX1-NEXT:    vpextrb $5, %xmm3, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm3, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm6, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm6, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm6, %edx
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm6, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm3, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm6, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm3, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm6, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm6, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm6, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm6, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, 12(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm2, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm2, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm2, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm2, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm2, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm6, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm6, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm5, %ecx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm6, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, 12(%rdi)
+; AVX1-NEXT:    vpextrb $1, %xmm2, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm5, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm5, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm5, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $3, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm5, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm5, %ecx
+; AVX1-NEXT:    vpextrb $5, %xmm2, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm5, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm5, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm5, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm5, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm5, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, 8(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm1, %edx
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm1, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm2, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm1, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm2, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm1, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm1, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm4, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm4, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm4, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm4, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm4, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm5, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm5, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm4, %ecx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm5, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, 8(%rdi)
+; AVX1-NEXT:    vpextrb $1, %xmm1, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm4, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm4, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm4, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm4, %edx
-; AVX1-NEXT:    shll $31, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, 4(%rdi)
-; AVX1-NEXT:    vpextrb $1, %xmm9, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    vpextrb $0, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX1-NEXT:    vpextrb $2, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX1-NEXT:    vpextrb $3, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX1-NEXT:    vpextrb $4, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $4, %ecx
-; AVX1-NEXT:    orl %eax, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm9, %eax
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    shll $5, %eax
-; AVX1-NEXT:    orl %ecx, %eax
-; AVX1-NEXT:    vpextrb $6, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $6, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm9, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $3, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $7, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $8, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm9, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $9, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm9, %ecx
+; AVX1-NEXT:    vpextrb $5, %xmm1, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $10, %ecx
+; AVX1-NEXT:    shll $5, %ecx
 ; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm9, %edx
+; AVX1-NEXT:    vpextrb $6, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $11, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $12, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm9, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $13, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm9, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $14, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm9, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $15, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $0, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $16, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $1, %xmm8, %edx
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $17, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $2, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $18, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $3, %xmm8, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm1, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $19, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $4, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $20, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $5, %xmm8, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm1, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $21, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $6, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $22, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $7, %xmm8, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $23, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $8, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $24, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $9, %xmm8, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $25, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $10, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $26, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $11, %xmm8, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $27, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $12, %xmm8, %ecx
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $28, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $13, %xmm8, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm4, %edx
 ; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    shll $29, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    vpextrb $14, %xmm8, %ecx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm4, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm4, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm4, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, 4(%rdi)
+; AVX1-NEXT:    vpextrb $1, %xmm9, %ecx
 ; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    shll $30, %ecx
-; AVX1-NEXT:    orl %edx, %ecx
-; AVX1-NEXT:    vpextrb $15, %xmm8, %edx
-; AVX1-NEXT:    shll $31, %edx
+; AVX1-NEXT:    vpextrb $0, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX1-NEXT:    vpextrb $2, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,4), %ecx
+; AVX1-NEXT:    vpextrb $3, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX1-NEXT:    vpextrb $4, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $4, %edx
 ; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    movl %edx, (%rdi)
-; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    vpextrb $5, %xmm9, %ecx
+; AVX1-NEXT:    andl $1, %ecx
+; AVX1-NEXT:    shll $5, %ecx
+; AVX1-NEXT:    orl %edx, %ecx
+; AVX1-NEXT:    vpextrb $6, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $6, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $7, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $8, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $9, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $10, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $11, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $12, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $13, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm9, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $14, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm9, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $15, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $0, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $16, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $17, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $2, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $18, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $3, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $19, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $4, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $20, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $5, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $21, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $6, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $22, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $7, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $23, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $8, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $24, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $9, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $25, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $10, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $26, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $11, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $27, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $12, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $28, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $13, %xmm8, %esi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    shll $29, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    vpextrb $14, %xmm8, %edx
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    shll $30, %edx
+; AVX1-NEXT:    orl %esi, %edx
+; AVX1-NEXT:    vpextrb $15, %xmm8, %esi
+; AVX1-NEXT:    shll $31, %esi
+; AVX1-NEXT:    orl %edx, %esi
+; AVX1-NEXT:    orl %ecx, %esi
+; AVX1-NEXT:    movl %esi, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_cmp_v128i8:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm1
 ; AVX2-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm3
-; AVX2-NEXT:    vpextrb $1, %xmm3, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm3, %ecx
+; AVX2-NEXT:    vpextrb $1, %xmm3, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm3, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm3, %edx
+; AVX2-NEXT:    vpextrb $0, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm3, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vpextrb $0, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm3, %edx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
 ; AVX2-NEXT:    vpextrb $3, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm3, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm3, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm3, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm3, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm3, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm3, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm3, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm3, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm3, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, 12(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm2, %edx
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm2, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
+; AVX2-NEXT:    vpextrb $0, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm2, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm2, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm2, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpextrb $0, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm2, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm3, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm2, %ecx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm3, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm3, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm3, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm3, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm3, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, 12(%rdi)
+; AVX2-NEXT:    vpextrb $1, %xmm2, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm2, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm2, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
 ; AVX2-NEXT:    vpextrb $3, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm2, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm2, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm2, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm2, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm2, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm2, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm2, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm2, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm2, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, 8(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT:    vpextrb $0, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm1, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm2, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm1, %ecx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm2, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm2, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm2, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm2, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm2, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, 8(%rdi)
+; AVX2-NEXT:    vpextrb $1, %xmm1, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
 ; AVX2-NEXT:    vpextrb $3, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm1, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm1, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm1, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm1, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm1, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm1, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm1, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, 4(%rdi)
-; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rcx,%rax,2), %eax
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,4), %eax
-; AVX2-NEXT:    vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    leal (%rax,%rcx,8), %eax
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $4, %ecx
-; AVX2-NEXT:    orl %eax, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    shll $5, %eax
-; AVX2-NEXT:    orl %ecx, %eax
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $7, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $8, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $9, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $11, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $12, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $13, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $14, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $15, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $1, %xmm0, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $17, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $2, %xmm0, %ecx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm1, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm1, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm1, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, 4(%rdi)
+; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $18, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
+; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,4), %ecx
 ; AVX2-NEXT:    vpextrb $3, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $19, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $20, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $5, %xmm0, %edx
+; AVX2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $21, %edx
+; AVX2-NEXT:    shll $4, %edx
 ; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $6, %xmm0, %ecx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %ecx
 ; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $22, %ecx
+; AVX2-NEXT:    shll $5, %ecx
 ; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $7, %xmm0, %edx
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $23, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $24, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $9, %xmm0, %edx
+; AVX2-NEXT:    shll $6, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $7, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $25, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $26, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $11, %xmm0, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $9, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $27, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $28, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $13, %xmm0, %edx
+; AVX2-NEXT:    shll $10, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $11, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
 ; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    shll $29, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    shll $30, %ecx
-; AVX2-NEXT:    orl %edx, %ecx
-; AVX2-NEXT:    vpextrb $15, %xmm0, %edx
-; AVX2-NEXT:    shll $31, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    movl %edx, (%rdi)
-; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shll $12, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $13, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $15, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrb $0, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $17, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $2, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $18, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $3, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $19, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $4, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $20, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $5, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $21, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $6, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $22, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $7, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $23, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $8, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $24, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $9, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $25, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $10, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $26, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $11, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $27, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $12, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $28, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $13, %xmm0, %esi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    shll $29, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    vpextrb $14, %xmm0, %edx
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    shll $30, %edx
+; AVX2-NEXT:    orl %esi, %edx
+; AVX2-NEXT:    vpextrb $15, %xmm0, %esi
+; AVX2-NEXT:    shll $31, %esi
+; AVX2-NEXT:    orl %edx, %esi
+; AVX2-NEXT:    orl %ecx, %esi
+; AVX2-NEXT:    movl %esi, (%rdi)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: test_cmp_v128i8:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm4
 ; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
@@ -6421,12 +6422,12 @@
 ; AVX512F-NEXT:    kmovw %k2, 4(%rdi)
 ; AVX512F-NEXT:    kmovw %k1, 2(%rdi)
 ; AVX512F-NEXT:    kmovw %k0, (%rdi)
-; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v128i8:
 ; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    movq %rdi, %rax
 ; AVX512DQ-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm4
 ; AVX512DQ-NEXT:    vpmovd2m %zmm4, %k0
@@ -6459,7 +6460,6 @@
 ; AVX512DQ-NEXT:    kmovw %k2, 4(%rdi)
 ; AVX512DQ-NEXT:    kmovw %k1, 2(%rdi)
 ; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
-; AVX512DQ-NEXT:    movq %rdi, %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -6481,6 +6481,7 @@
 define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v32f64:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
 ; SSE2-NEXT:    cmpltpd %xmm1, %xmm8
 ; SSE2-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
@@ -6565,126 +6566,125 @@
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; SSE2-NEXT:    movapd %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movapd %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32f64:
@@ -6695,7 +6695,7 @@
 ; SSE42-NEXT:    pushq %r13
 ; SSE42-NEXT:    pushq %r12
 ; SSE42-NEXT:    pushq %rbx
-; SSE42-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE42-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
 ; SSE42-NEXT:    cmpltpd %xmm7, %xmm8
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
@@ -6730,24 +6730,24 @@
 ; SSE42-NEXT:    pextrb $8, %xmm0, %r9d
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %esi
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
 ; SSE42-NEXT:    pextrb $8, %xmm0, %r12d
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    pextrb $0, %xmm0, %esi
 ; SSE42-NEXT:    pextrb $8, %xmm0, %ebx
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
 ; SSE42-NEXT:    pextrb $8, %xmm0, %r13d
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    andl $1, %r8d
 ; SSE42-NEXT:    andl $1, %r10d
-; SSE42-NEXT:    leal (%r10,%r8,2), %ecx
+; SSE42-NEXT:    leal (%r10,%r8,2), %eax
 ; SSE42-NEXT:    andl $1, %ebp
-; SSE42-NEXT:    leal (%rcx,%rbp,4), %r8d
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
+; SSE42-NEXT:    leal (%rax,%rbp,4), %r8d
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; SSE42-NEXT:    pextrb $8, %xmm0, %ebp
 ; SSE42-NEXT:    andl $1, %edi
 ; SSE42-NEXT:    leal (%r8,%rdi,8), %r8d
@@ -6755,8 +6755,8 @@
 ; SSE42-NEXT:    shll $4, %r15d
 ; SSE42-NEXT:    orl %r8d, %r15d
 ; SSE42-NEXT:    pextrb $8, %xmm1, %edi
-; SSE42-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; SSE42-NEXT:    pextrb $0, %xmm1, %r10d
+; SSE42-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE42-NEXT:    pextrb $0, %xmm1, %r8d
 ; SSE42-NEXT:    andl $1, %r11d
 ; SSE42-NEXT:    shll $5, %r11d
 ; SSE42-NEXT:    orl %r15d, %r11d
@@ -6765,93 +6765,93 @@
 ; SSE42-NEXT:    andl $1, %r9d
 ; SSE42-NEXT:    shll $7, %r9d
 ; SSE42-NEXT:    orl %r14d, %r9d
-; SSE42-NEXT:    pextrb $0, %xmm2, %r14d
+; SSE42-NEXT:    pextrb $0, %xmm2, %r10d
 ; SSE42-NEXT:    pextrb $8, %xmm2, %edi
-; SSE42-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    shll $8, %esi
-; SSE42-NEXT:    orl %r9d, %esi
+; SSE42-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %r9d, %edx
 ; SSE42-NEXT:    andl $1, %r12d
 ; SSE42-NEXT:    shll $9, %r12d
-; SSE42-NEXT:    orl %esi, %r12d
-; SSE42-NEXT:    pextrb $0, %xmm3, %r8d
-; SSE42-NEXT:    pextrb $8, %xmm3, %r15d
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $10, %edx
-; SSE42-NEXT:    orl %r12d, %edx
+; SSE42-NEXT:    orl %edx, %r12d
+; SSE42-NEXT:    pextrb $0, %xmm3, %edi
+; SSE42-NEXT:    pextrb $8, %xmm3, %r9d
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $10, %esi
+; SSE42-NEXT:    orl %r12d, %esi
 ; SSE42-NEXT:    andl $1, %ebx
 ; SSE42-NEXT:    shll $11, %ebx
-; SSE42-NEXT:    orl %edx, %ebx
-; SSE42-NEXT:    pextrb $0, %xmm4, %r12d
-; SSE42-NEXT:    pextrb $8, %xmm4, %edi
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $12, %eax
-; SSE42-NEXT:    orl %ebx, %eax
+; SSE42-NEXT:    orl %esi, %ebx
+; SSE42-NEXT:    pextrb $0, %xmm4, %r15d
+; SSE42-NEXT:    pextrb $8, %xmm4, %r12d
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $12, %ecx
+; SSE42-NEXT:    orl %ebx, %ecx
 ; SSE42-NEXT:    andl $1, %r13d
 ; SSE42-NEXT:    shll $13, %r13d
-; SSE42-NEXT:    orl %eax, %r13d
-; SSE42-NEXT:    pextrb $0, %xmm5, %eax
+; SSE42-NEXT:    orl %ecx, %r13d
+; SSE42-NEXT:    pextrb $0, %xmm5, %ecx
 ; SSE42-NEXT:    pextrb $8, %xmm5, %ebx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %r13d, %ecx
+; SSE42-NEXT:    andl $1, %eax
+; SSE42-NEXT:    shll $14, %eax
+; SSE42-NEXT:    orl %r13d, %eax
 ; SSE42-NEXT:    shll $15, %ebp
-; SSE42-NEXT:    orl %ecx, %ebp
+; SSE42-NEXT:    orl %eax, %ebp
 ; SSE42-NEXT:    pextrb $0, %xmm6, %r13d
-; SSE42-NEXT:    pextrb $8, %xmm6, %edx
+; SSE42-NEXT:    pextrb $8, %xmm6, %esi
 ; SSE42-NEXT:    orl %r11d, %ebp
-; SSE42-NEXT:    movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload
-; SSE42-NEXT:    movw %bp, 2(%r9)
+; SSE42-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE42-NEXT:    movw %bp, 2(%r14)
 ; SSE42-NEXT:    pextrb $0, %xmm7, %r11d
-; SSE42-NEXT:    pextrb $8, %xmm7, %ecx
-; SSE42-NEXT:    movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    pextrb $8, %xmm7, %eax
+; SSE42-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    andl $1, %r8d
+; SSE42-NEXT:    leal (%r8,%rdx,2), %r8d
 ; SSE42-NEXT:    andl $1, %r10d
-; SSE42-NEXT:    leal (%r10,%rsi,2), %esi
-; SSE42-NEXT:    andl $1, %r14d
-; SSE42-NEXT:    leal (%rsi,%r14,4), %r14d
+; SSE42-NEXT:    leal (%r8,%r10,4), %r8d
 ; SSE42-NEXT:    pextrb $0, %xmm8, %r10d
 ; SSE42-NEXT:    pextrb $8, %xmm8, %ebp
-; SSE42-NEXT:    movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; SSE42-NEXT:    andl $1, %esi
-; SSE42-NEXT:    leal (%r14,%rsi,8), %esi
-; SSE42-NEXT:    andl $1, %r8d
-; SSE42-NEXT:    shll $4, %r8d
-; SSE42-NEXT:    orl %esi, %r8d
-; SSE42-NEXT:    andl $1, %r15d
-; SSE42-NEXT:    shll $5, %r15d
-; SSE42-NEXT:    orl %r8d, %r15d
-; SSE42-NEXT:    andl $1, %r12d
-; SSE42-NEXT:    shll $6, %r12d
+; SSE42-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%r8,%rdx,8), %r8d
 ; SSE42-NEXT:    andl $1, %edi
-; SSE42-NEXT:    shll $7, %edi
-; SSE42-NEXT:    orl %r12d, %edi
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $8, %eax
-; SSE42-NEXT:    orl %edi, %eax
+; SSE42-NEXT:    shll $4, %edi
+; SSE42-NEXT:    orl %r8d, %edi
+; SSE42-NEXT:    andl $1, %r9d
+; SSE42-NEXT:    shll $5, %r9d
+; SSE42-NEXT:    orl %edi, %r9d
+; SSE42-NEXT:    andl $1, %r15d
+; SSE42-NEXT:    shll $6, %r15d
+; SSE42-NEXT:    andl $1, %r12d
+; SSE42-NEXT:    shll $7, %r12d
+; SSE42-NEXT:    orl %r15d, %r12d
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $8, %ecx
+; SSE42-NEXT:    orl %r12d, %ecx
 ; SSE42-NEXT:    andl $1, %ebx
 ; SSE42-NEXT:    shll $9, %ebx
-; SSE42-NEXT:    orl %eax, %ebx
+; SSE42-NEXT:    orl %ecx, %ebx
 ; SSE42-NEXT:    andl $1, %r13d
 ; SSE42-NEXT:    shll $10, %r13d
 ; SSE42-NEXT:    orl %ebx, %r13d
-; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %r13d, %edx
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %r13d, %esi
 ; SSE42-NEXT:    andl $1, %r11d
 ; SSE42-NEXT:    shll $12, %r11d
-; SSE42-NEXT:    orl %edx, %r11d
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $13, %ecx
-; SSE42-NEXT:    orl %r11d, %ecx
+; SSE42-NEXT:    orl %esi, %r11d
+; SSE42-NEXT:    andl $1, %eax
+; SSE42-NEXT:    shll $13, %eax
+; SSE42-NEXT:    orl %r11d, %eax
 ; SSE42-NEXT:    andl $1, %r10d
 ; SSE42-NEXT:    shll $14, %r10d
-; SSE42-NEXT:    orl %ecx, %r10d
+; SSE42-NEXT:    orl %eax, %r10d
 ; SSE42-NEXT:    shll $15, %ebp
 ; SSE42-NEXT:    orl %r10d, %ebp
-; SSE42-NEXT:    orl %r15d, %ebp
-; SSE42-NEXT:    movw %bp, (%r9)
-; SSE42-NEXT:    movq %r9, %rax
+; SSE42-NEXT:    orl %r9d, %ebp
+; SSE42-NEXT:    movw %bp, (%r14)
+; SSE42-NEXT:    movq %r14, %rax
 ; SSE42-NEXT:    popq %rbx
 ; SSE42-NEXT:    popq %r12
 ; SSE42-NEXT:    popq %r13
@@ -6998,6 +6998,7 @@
 define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ; SSE2-LABEL: test_cmp_v32i64:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
@@ -7234,130 +7235,130 @@
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE2-NEXT:    movapd %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, 2(%rdi)
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, 2(%rdi)
 ; SSE2-NEXT:    movapd %xmm3, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $4, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    shll $5, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $6, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $7, %edx
+; SSE2-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $4, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    shll $5, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $9, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $10, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $6, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $7, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $11, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $8, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $9, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE2-NEXT:    andl $1, %edx
-; SSE2-NEXT:    shll $13, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    andl $1, %ecx
-; SSE2-NEXT:    shll $14, %ecx
-; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $11, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT:    shll $15, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movw %dx, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $12, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    andl $1, %esi
+; SSE2-NEXT:    shll $13, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE2-NEXT:    andl $1, %edx
+; SSE2-NEXT:    shll $14, %edx
+; SSE2-NEXT:    orl %esi, %edx
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT:    shll $15, %esi
+; SSE2-NEXT:    orl %edx, %esi
+; SSE2-NEXT:    orl %ecx, %esi
+; SSE2-NEXT:    movw %si, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: test_cmp_v32i64:
 ; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm14
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13
@@ -7382,125 +7383,124 @@
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm13
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm14
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT:    pextrb $8, %xmm15, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm15, %ecx
+; SSE42-NEXT:    pextrb $8, %xmm15, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $0, %xmm14, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $8, %xmm14, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $0, %xmm13, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm13, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $0, %xmm12, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm12, %edx
+; SSE42-NEXT:    pextrb $0, %xmm15, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm11, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm11, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm14, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm10, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm10, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm14, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm9, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm9, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm13, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm8, %ecx
+; SSE42-NEXT:    pextrb $8, %xmm13, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
+; SSE42-NEXT:    shll $5, %ecx
 ; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm8, %edx
-; SSE42-NEXT:    shll $15, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    pextrb $0, %xmm0, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rcx,%rax,2), %eax
-; SSE42-NEXT:    pextrb $0, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,4), %eax
-; SSE42-NEXT:    pextrb $8, %xmm1, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    leal (%rax,%rcx,8), %eax
-; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $4, %ecx
-; SSE42-NEXT:    orl %eax, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm2, %eax
-; SSE42-NEXT:    andl $1, %eax
-; SSE42-NEXT:    shll $5, %eax
-; SSE42-NEXT:    orl %ecx, %eax
-; SSE42-NEXT:    pextrb $0, %xmm3, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $6, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm3, %edx
+; SSE42-NEXT:    pextrb $0, %xmm12, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $7, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm4, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $8, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm4, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $8, %xmm12, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm11, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $9, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm5, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $10, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm5, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm11, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm10, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $11, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm6, %ecx
-; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $12, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm6, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm10, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm9, %edx
 ; SSE42-NEXT:    andl $1, %edx
-; SSE42-NEXT:    shll $13, %edx
-; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    pextrb $0, %xmm7, %ecx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm9, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm8, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm8, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, 2(%rdi)
+; SSE42-NEXT:    pextrb $8, %xmm0, %ecx
 ; SSE42-NEXT:    andl $1, %ecx
-; SSE42-NEXT:    shll $14, %ecx
-; SSE42-NEXT:    orl %edx, %ecx
-; SSE42-NEXT:    pextrb $8, %xmm7, %edx
-; SSE42-NEXT:    shll $15, %edx
+; SSE42-NEXT:    pextrb $0, %xmm0, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rdx,%rcx,2), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,4), %ecx
+; SSE42-NEXT:    pextrb $8, %xmm1, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    leal (%rcx,%rdx,8), %ecx
+; SSE42-NEXT:    pextrb $0, %xmm2, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $4, %edx
 ; SSE42-NEXT:    orl %ecx, %edx
-; SSE42-NEXT:    orl %eax, %edx
-; SSE42-NEXT:    movw %dx, (%rdi)
-; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrb $8, %xmm2, %ecx
+; SSE42-NEXT:    andl $1, %ecx
+; SSE42-NEXT:    shll $5, %ecx
+; SSE42-NEXT:    orl %edx, %ecx
+; SSE42-NEXT:    pextrb $0, %xmm3, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $6, %edx
+; SSE42-NEXT:    pextrb $8, %xmm3, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $7, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm4, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $8, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm4, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $9, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm5, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $10, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm5, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $11, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm6, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $12, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm6, %esi
+; SSE42-NEXT:    andl $1, %esi
+; SSE42-NEXT:    shll $13, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    pextrb $0, %xmm7, %edx
+; SSE42-NEXT:    andl $1, %edx
+; SSE42-NEXT:    shll $14, %edx
+; SSE42-NEXT:    orl %esi, %edx
+; SSE42-NEXT:    pextrb $8, %xmm7, %esi
+; SSE42-NEXT:    shll $15, %esi
+; SSE42-NEXT:    orl %edx, %esi
+; SSE42-NEXT:    orl %ecx, %esi
+; SSE42-NEXT:    movw %si, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: test_cmp_v32i64:
diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
index 04e6ccc..0c71d12 100644
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -10,6 +10,7 @@
 define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d, <8 x i16> %e, <8 x i16> %f, <8 x i16> %h, <8 x i16> %g) {
 ; SSE-LABEL: interleave8x8:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movdqa %xmm0, %xmm8
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -54,7 +55,6 @@
 ; SSE-NEXT:    movdqa %xmm1, 32(%rdi)
 ; SSE-NEXT:    movdqa %xmm8, 16(%rdi)
 ; SSE-NEXT:    movdqa %xmm5, (%rdi)
-; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: interleave8x8:
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index b2c0a4d..0cdc900 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -86,10 +86,10 @@
 define <1 x i128> @test_strange_type(<1 x i128> %x) {
 ; CHECK-LABEL: test_strange_type:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    sarq $63, %rsi
-; CHECK-NEXT:    notq %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    movq %rsi, %rdx
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    notq %rax
+; CHECK-NEXT:    movq %rax, %rdx
 ; CHECK-NEXT:    retq
   %sign = ashr <1 x i128> %x, <i128 127>
   %not = xor <1 x i128> %sign, <i128 -1>
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index e126d90..b17734b 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -20,8 +20,8 @@
 ;
 ; SSE41-LABEL: test_v2f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddps %xmm1, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    haddps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32:
@@ -527,8 +527,8 @@
 ;
 ; SSE41-LABEL: test_v2f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    haddpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64:
@@ -555,9 +555,9 @@
 ;
 ; SSE41-LABEL: test_v4f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm2, %xmm1
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    addpd %xmm2, %xmm0
+; SSE41-NEXT:    haddpd %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64:
@@ -594,11 +594,11 @@
 ;
 ; SSE41-LABEL: test_v8f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm4, %xmm2
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm1
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    addpd %xmm4, %xmm2
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    addpd %xmm2, %xmm0
+; SSE41-NEXT:    haddpd %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64:
@@ -643,15 +643,15 @@
 ;
 ; SSE41-LABEL: test_v16f64:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movapd %xmm4, %xmm0
 ; SSE41-NEXT:    addpd %xmm6, %xmm2
 ; SSE41-NEXT:    addpd %xmm7, %xmm3
 ; SSE41-NEXT:    addpd %xmm5, %xmm1
 ; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
-; SSE41-NEXT:    addpd %xmm2, %xmm4
-; SSE41-NEXT:    addpd %xmm1, %xmm4
-; SSE41-NEXT:    haddpd %xmm4, %xmm4
-; SSE41-NEXT:    movapd %xmm4, %xmm0
+; SSE41-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    addpd %xmm2, %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
+; SSE41-NEXT:    haddpd %xmm0, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 3426da3..679a81b 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -257,32 +257,33 @@
 ;
 ; SSE41-LABEL: var_shift_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    paddw %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    psraw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psraw $4, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psraw $2, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psraw $1, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psraw $8, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 90f064b..bd77311 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -227,32 +227,33 @@
 ;
 ; SSE41-LABEL: var_shift_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psllw $12, %xmm0
-; SSE41-NEXT:    psllw $4, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    paddw %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm2, %xmm4
-; SSE41-NEXT:    psrlw $8, %xmm4
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrlw $4, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrlw $2, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
-; SSE41-NEXT:    psrlw $1, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $12, %xmm0
+; SSE41-NEXT:    psllw $4, %xmm2
+; SSE41-NEXT:    por %xmm0, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psrlw $8, %xmm4
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $2, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm3, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: var_shift_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 9785bb5..ffcaad8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -164,8 +164,8 @@
 define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
 ; SSE2-LABEL: shuffle_v2f64_22:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2f64_22:
@@ -193,8 +193,8 @@
 define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: shuffle_v2f64_32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_32:
@@ -208,8 +208,8 @@
 define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: shuffle_v2f64_33:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_33:
@@ -309,8 +309,8 @@
 define <2 x double> @shuffle_v2f64_3u(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: shuffle_v2f64_3u:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_3u:
@@ -337,8 +337,8 @@
 define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE-LABEL: shuffle_v2i64_02_copy:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_02_copy:
@@ -382,26 +382,26 @@
 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_03_copy:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_03_copy:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; SSE3-NEXT:    movapd %xmm2, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_03_copy:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
 ; SSSE3-NEXT:    movapd %xmm2, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_03_copy:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_03_copy:
@@ -444,26 +444,26 @@
 define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_12_copy:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_12_copy:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
 ; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm2[0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_12_copy:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_12_copy:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_12_copy:
@@ -489,8 +489,8 @@
 define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE-LABEL: shuffle_v2i64_13_copy:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_13_copy:
@@ -517,8 +517,8 @@
 define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE-LABEL: shuffle_v2i64_20_copy:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_20_copy:
@@ -559,26 +559,26 @@
 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_21_copy:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_21_copy:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_21_copy:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_21_copy:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_21_copy:
@@ -621,26 +621,26 @@
 define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_30_copy:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
 ; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_30_copy:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
 ; SSE3-NEXT:    movapd %xmm2, %xmm0
+; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_30_copy:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_30_copy:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_30_copy:
@@ -667,8 +667,8 @@
 define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE-LABEL: shuffle_v2i64_31_copy:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_31_copy:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
index 5da9419..1b701f8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
@@ -33,8 +33,8 @@
 define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSSE3-LABEL: combine_insertqi_pshufb_16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE42-LABEL: combine_insertqi_pshufb_16i8:
@@ -54,8 +54,8 @@
 define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSSE3-LABEL: combine_insertqi_pshufb_8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE42-LABEL: combine_insertqi_pshufb_8i16:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index a1316eb..f32f87b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -574,8 +574,8 @@
 define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: combine_unpckl_arg1_pshufb:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_unpckl_arg1_pshufb:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 0d0e1c6..2eb9362 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1624,8 +1624,8 @@
 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: combine_test1b:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test1b:
@@ -1640,8 +1640,8 @@
 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
 ; SSE2-LABEL: combine_test2b:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_test2b:
@@ -1695,8 +1695,8 @@
 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: combine_test4b:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_test4b:
@@ -2766,9 +2766,9 @@
 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: PR22412:
 ; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
 ; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: PR22412:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
index 1e07412..2cb8cd0 100644
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -2123,6 +2123,7 @@
 define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
 ; SSE2-LABEL: zext_32i8_to_32i32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -2150,11 +2151,11 @@
 ; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
 ; SSE2-NEXT:    movdqa %xmm3, 16(%rdi)
 ; SSE2-NEXT:    movdqa %xmm8, (%rdi)
-; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: zext_32i8_to_32i32:
 ; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    pxor %xmm2, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -2182,11 +2183,11 @@
 ; SSSE3-NEXT:    movdqa %xmm5, 32(%rdi)
 ; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
 ; SSSE3-NEXT:    movdqa %xmm8, (%rdi)
-; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: zext_32i8_to_32i32:
 ; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq %rdi, %rax
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
 ; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
@@ -2209,7 +2210,6 @@
 ; SSE41-NEXT:    movdqa %xmm4, 32(%rdi)
 ; SSE41-NEXT:    movdqa %xmm3, 16(%rdi)
 ; SSE41-NEXT:    movdqa %xmm2, (%rdi)
-; SSE41-NEXT:    movq %rdi, %rax
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_32i8_to_32i32:
diff --git a/llvm/test/CodeGen/X86/vectorcall.ll b/llvm/test/CodeGen/X86/vectorcall.ll
index 9914780..08a2827 100644
--- a/llvm/test/CodeGen/X86/vectorcall.ll
+++ b/llvm/test/CodeGen/X86/vectorcall.ll
@@ -22,7 +22,8 @@
 }
 ; X86-LABEL: {{^}}test_int_3@@8:
 ; X64-LABEL: {{^}}test_int_3@@8:
-; CHECK: movl %ecx, %eax
+; X86: movl %ecx, %eax
+; X64: movq %rcx, %rax
 
 define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
   %s = add i32 %a, %b
@@ -148,8 +149,8 @@
   ret <4 x float> %0
 }
 ; CHECK-LABEL: test_mixed_5
-; CHECK:       movaps	%xmm5, 16(%{{(e|r)}}sp)
-; CHECK:       movaps	%xmm5, %xmm0
+; CHECK-DAG:   movaps	%xmm{{[0,5]}}, 16(%{{(e|r)}}sp)
+; CHECK-DAG:   movaps	%xmm5, %xmm0
 ; CHECK:       ret{{[ql]}}
 
 define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) {
@@ -183,12 +184,12 @@
   ret void
 }
 ; CHECK-LABEL: test_mixed_7
+; X64:         mov{{[ql]}}	%rcx, %rax
 ; CHECK:       movaps	%xmm{{[0-9]}}, 64(%{{rcx|eax}})
 ; CHECK:       movaps	%xmm{{[0-9]}}, 48(%{{rcx|eax}})
 ; CHECK:       movaps	%xmm{{[0-9]}}, 32(%{{rcx|eax}})
 ; CHECK:       movaps	%xmm{{[0-9]}}, 16(%{{rcx|eax}})
 ; CHECK:       movaps	%xmm{{[0-9]}}, (%{{rcx|eax}})
-; X64:         mov{{[ql]}}	%rcx, %rax
 ; CHECK:       ret{{[ql]}}
 
 define x86_vectorcallcc <4 x float> @test_mixed_8(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) {
diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll
index 9cfb248..d61c4bc 100644
--- a/llvm/test/CodeGen/X86/vselect-minmax.ll
+++ b/llvm/test/CodeGen/X86/vselect-minmax.ll
@@ -4535,23 +4535,24 @@
 ;
 ; SSE4-LABEL: test121:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm6, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    movdqa %xmm8, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test121:
@@ -4655,23 +4656,24 @@
 ;
 ; SSE4-LABEL: test122:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm6, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    movdqa %xmm8, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test122:
@@ -4775,9 +4777,10 @@
 ;
 ; SSE4-LABEL: test123:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
@@ -4785,12 +4788,12 @@
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test123:
@@ -4894,9 +4897,10 @@
 ;
 ; SSE4-LABEL: test124:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
@@ -4904,12 +4908,12 @@
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test124:
@@ -5013,36 +5017,39 @@
 ;
 ; SSE4-LABEL: test125:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm0, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    pxor %xmm7, %xmm6
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm1, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
-; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE4-NEXT:    movdqa %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
 ; SSE4-NEXT:    movdqa %xmm2, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
-; SSE4-NEXT:    movdqa %xmm6, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm1
+; SSE4-NEXT:    movdqa %xmm9, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm7, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
-; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm8, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test125:
@@ -5160,36 +5167,39 @@
 ;
 ; SSE4-LABEL: test126:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm0, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    pxor %xmm7, %xmm6
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm1, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
-; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE4-NEXT:    movdqa %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
 ; SSE4-NEXT:    movdqa %xmm2, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
-; SSE4-NEXT:    movdqa %xmm6, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm1
+; SSE4-NEXT:    movdqa %xmm9, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm7, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
-; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm8, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test126:
@@ -5307,35 +5317,38 @@
 ;
 ; SSE4-LABEL: test127:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm4, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm5, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm4, %xmm5
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    pxor %xmm7, %xmm4
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm4
+; SSE4-NEXT:    pxor %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT:    movdqa %xmm6, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT:    movdqa %xmm9, %xmm1
+; SSE4-NEXT:    pxor %xmm7, %xmm1
 ; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT:    movdqa %xmm7, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm3, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT:    movapd %xmm5, %xmm0
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test127:
@@ -5453,35 +5466,38 @@
 ;
 ; SSE4-LABEL: test128:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm4, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm5, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm4, %xmm5
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    pxor %xmm7, %xmm4
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm4
+; SSE4-NEXT:    pxor %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT:    movdqa %xmm6, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT:    movdqa %xmm9, %xmm1
+; SSE4-NEXT:    pxor %xmm7, %xmm1
 ; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT:    movdqa %xmm7, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm3, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT:    movapd %xmm5, %xmm0
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test128:
@@ -6977,9 +6993,10 @@
 ;
 ; SSE4-LABEL: test153:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
@@ -6987,12 +7004,12 @@
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test153:
@@ -7096,9 +7113,10 @@
 ;
 ; SSE4-LABEL: test154:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
@@ -7106,12 +7124,12 @@
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test154:
@@ -7215,23 +7233,24 @@
 ;
 ; SSE4-LABEL: test155:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm8
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm0, %xmm7
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
+; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm6, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    movdqa %xmm8, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test155:
@@ -7335,35 +7354,38 @@
 ;
 ; SSE4-LABEL: test156:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm4, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm5, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm4, %xmm5
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    pxor %xmm7, %xmm4
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm4
+; SSE4-NEXT:    pxor %xmm7, %xmm4
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT:    movdqa %xmm6, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
+; SSE4-NEXT:    movdqa %xmm9, %xmm1
+; SSE4-NEXT:    pxor %xmm7, %xmm1
 ; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
-; SSE4-NEXT:    movdqa %xmm7, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm3, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
-; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm3, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
+; SSE4-NEXT:    movapd %xmm5, %xmm0
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test156:
@@ -7481,36 +7503,39 @@
 ;
 ; SSE4-LABEL: test159:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm0, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    pxor %xmm7, %xmm6
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm1, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
-; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE4-NEXT:    movdqa %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
 ; SSE4-NEXT:    movdqa %xmm2, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
-; SSE4-NEXT:    movdqa %xmm6, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm1
+; SSE4-NEXT:    movdqa %xmm9, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm7, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
-; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm8, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test159:
@@ -7628,36 +7653,39 @@
 ;
 ; SSE4-LABEL: test160:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqa %xmm0, %xmm9
-; SSE4-NEXT:    movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT:    movdqa %xmm0, %xmm10
-; SSE4-NEXT:    pxor %xmm8, %xmm10
+; SSE4-NEXT:    movdqa %xmm7, %xmm8
+; SSE4-NEXT:    movdqa %xmm6, %xmm9
+; SSE4-NEXT:    movdqa %xmm5, %xmm10
+; SSE4-NEXT:    movdqa %xmm0, %xmm5
+; SSE4-NEXT:    movdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT:    movdqa %xmm0, %xmm6
+; SSE4-NEXT:    pxor %xmm7, %xmm6
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
-; SSE4-NEXT:    movdqa %xmm1, %xmm9
-; SSE4-NEXT:    pxor %xmm8, %xmm9
-; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pcmpgtq %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm4
+; SSE4-NEXT:    movdqa %xmm1, %xmm5
+; SSE4-NEXT:    pxor %xmm7, %xmm5
+; SSE4-NEXT:    movdqa %xmm10, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pcmpgtq %xmm5, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm10
 ; SSE4-NEXT:    movdqa %xmm2, %xmm1
-; SSE4-NEXT:    pxor %xmm8, %xmm1
-; SSE4-NEXT:    movdqa %xmm6, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm1
+; SSE4-NEXT:    movdqa %xmm9, %xmm0
+; SSE4-NEXT:    pxor %xmm7, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
 ; SSE4-NEXT:    movdqa %xmm3, %xmm0
-; SSE4-NEXT:    pxor %xmm8, %xmm0
-; SSE4-NEXT:    pxor %xmm7, %xmm8
-; SSE4-NEXT:    pcmpgtq %xmm0, %xmm8
-; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
+; SSE4-NEXT:    pxor %xmm7, %xmm0
+; SSE4-NEXT:    pxor %xmm8, %xmm7
+; SSE4-NEXT:    pcmpgtq %xmm0, %xmm7
+; SSE4-NEXT:    movdqa %xmm7, %xmm0
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm8
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
-; SSE4-NEXT:    movapd %xmm5, %xmm1
-; SSE4-NEXT:    movapd %xmm6, %xmm2
-; SSE4-NEXT:    movapd %xmm7, %xmm3
+; SSE4-NEXT:    movapd %xmm10, %xmm1
+; SSE4-NEXT:    movapd %xmm9, %xmm2
+; SSE4-NEXT:    movapd %xmm8, %xmm3
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test160:
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 899d46b..2d08e21 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -457,6 +457,7 @@
 define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
 ; SSE-LABEL: select_illegal:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
 ; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
@@ -469,13 +470,12 @@
 ; SSE-NEXT:    movaps %xmm2, 32(%rdi)
 ; SSE-NEXT:    movaps %xmm1, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, (%rdi)
-; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_illegal:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps %ymm6, %ymm2
 ; AVX-NEXT:    vmovaps %ymm7, %ymm3
+; AVX-NEXT:    vmovaps %ymm6, %ymm2
 ; AVX-NEXT:    retq
   %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
   ret <16 x double> %sel
diff --git a/llvm/test/CodeGen/X86/widen_bitops-0.ll b/llvm/test/CodeGen/X86/widen_bitops-0.ll
index 5312737..251009d 100644
--- a/llvm/test/CodeGen/X86/widen_bitops-0.ll
+++ b/llvm/test/CodeGen/X86/widen_bitops-0.ll
@@ -15,8 +15,8 @@
 ;
 ; X64-SSE-LABEL: and_i24_as_v3i8:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    andl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    andl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <3 x i8>
   %2 = bitcast i24 %b to <3 x i8>
@@ -34,8 +34,8 @@
 ;
 ; X64-SSE-LABEL: xor_i24_as_v3i8:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <3 x i8>
   %2 = bitcast i24 %b to <3 x i8>
@@ -53,8 +53,8 @@
 ;
 ; X64-SSE-LABEL: or_i24_as_v3i8:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    orl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    orl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <3 x i8>
   %2 = bitcast i24 %b to <3 x i8>
@@ -76,8 +76,8 @@
 ;
 ; X64-SSE-LABEL: and_i24_as_v8i3:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    andl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    andl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <8 x i3>
   %2 = bitcast i24 %b to <8 x i3>
@@ -95,8 +95,8 @@
 ;
 ; X64-SSE-LABEL: xor_i24_as_v8i3:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <8 x i3>
   %2 = bitcast i24 %b to <8 x i3>
@@ -114,8 +114,8 @@
 ;
 ; X64-SSE-LABEL: or_i24_as_v8i3:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    orl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    orl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i24 %a to <8 x i3>
   %2 = bitcast i24 %b to <8 x i3>
diff --git a/llvm/test/CodeGen/X86/widen_bitops-1.ll b/llvm/test/CodeGen/X86/widen_bitops-1.ll
index fa41b16..3c97efe 100644
--- a/llvm/test/CodeGen/X86/widen_bitops-1.ll
+++ b/llvm/test/CodeGen/X86/widen_bitops-1.ll
@@ -15,8 +15,8 @@
 ;
 ; X64-SSE-LABEL: and_i32_as_v4i8:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    andl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    andl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <4 x i8>
   %2 = bitcast i32 %b to <4 x i8>
@@ -34,8 +34,8 @@
 ;
 ; X64-SSE-LABEL: xor_i32_as_v4i8:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <4 x i8>
   %2 = bitcast i32 %b to <4 x i8>
@@ -53,8 +53,8 @@
 ;
 ; X64-SSE-LABEL: or_i32_as_v4i8:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    orl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    orl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <4 x i8>
   %2 = bitcast i32 %b to <4 x i8>
@@ -76,8 +76,8 @@
 ;
 ; X64-SSE-LABEL: and_i32_as_v8i4:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    andl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    andl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <8 x i4>
   %2 = bitcast i32 %b to <8 x i4>
@@ -95,8 +95,8 @@
 ;
 ; X64-SSE-LABEL: xor_i32_as_v8i4:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    xorl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    xorl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <8 x i4>
   %2 = bitcast i32 %b to <8 x i4>
@@ -114,8 +114,8 @@
 ;
 ; X64-SSE-LABEL: or_i32_as_v8i4:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    orl %esi, %edi
 ; X64-SSE-NEXT:    movl %edi, %eax
+; X64-SSE-NEXT:    orl %esi, %eax
 ; X64-SSE-NEXT:    retq
   %1 = bitcast i32 %a to <8 x i4>
   %2 = bitcast i32 %b to <8 x i4>
diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll
index 1436b55..5147db0 100644
--- a/llvm/test/CodeGen/X86/widen_load-2.ll
+++ b/llvm/test/CodeGen/X86/widen_load-2.ll
@@ -21,11 +21,11 @@
 ;
 ; X64-LABEL: add3i32:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movdqa (%rsi), %xmm0
 ; X64-NEXT:    paddd (%rdx), %xmm0
 ; X64-NEXT:    pextrd $2, %xmm0, 8(%rdi)
 ; X64-NEXT:    movq %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i32vec3, %i32vec3* %ap, align 16
 	%b = load %i32vec3, %i32vec3* %bp, align 16
@@ -54,6 +54,7 @@
 ;
 ; X64-LABEL: add3i32_2:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    pinsrd $2, 8(%rsi), %xmm0
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
@@ -61,7 +62,6 @@
 ; X64-NEXT:    paddd %xmm0, %xmm1
 ; X64-NEXT:    pextrd $2, %xmm1, 8(%rdi)
 ; X64-NEXT:    movq %xmm1, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i32vec3, %i32vec3* %ap, align 8
 	%b = load %i32vec3, %i32vec3* %bp, align 8
@@ -89,6 +89,7 @@
 ;
 ; X64-LABEL: add7i32:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movdqa (%rsi), %xmm0
 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
 ; X64-NEXT:    paddd (%rdx), %xmm0
@@ -96,7 +97,6 @@
 ; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
 ; X64-NEXT:    movq %xmm1, 16(%rdi)
 ; X64-NEXT:    movdqa %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i32vec7, %i32vec7* %ap, align 16
 	%b = load %i32vec7, %i32vec7* %bp, align 16
@@ -125,6 +125,7 @@
 ;
 ; X64-LABEL: add12i32:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movdqa (%rsi), %xmm0
 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
 ; X64-NEXT:    movdqa 32(%rsi), %xmm2
@@ -134,7 +135,6 @@
 ; X64-NEXT:    movdqa %xmm2, 32(%rdi)
 ; X64-NEXT:    movdqa %xmm1, 16(%rdi)
 ; X64-NEXT:    movdqa %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i32vec12, %i32vec12* %ap, align 16
 	%b = load %i32vec12, %i32vec12* %bp, align 16
@@ -171,13 +171,13 @@
 ;
 ; X64-LABEL: add3i16:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X64-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; X64-NEXT:    paddd %xmm0, %xmm1
 ; X64-NEXT:    pextrw $4, %xmm1, 4(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; X64-NEXT:    movd %xmm1, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i16vec3, %i16vec3* %ap, align 16
 	%b = load %i16vec3, %i16vec3* %bp, align 16
@@ -201,11 +201,11 @@
 ;
 ; X64-LABEL: add4i16:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    paddw %xmm0, %xmm1
 ; X64-NEXT:    movq %xmm1, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i16vec4, %i16vec4* %ap, align 16
 	%b = load %i16vec4, %i16vec4* %bp, align 16
@@ -232,13 +232,13 @@
 ;
 ; X64-LABEL: add12i16:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movdqa (%rsi), %xmm0
 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
 ; X64-NEXT:    paddw (%rdx), %xmm0
 ; X64-NEXT:    paddw 16(%rdx), %xmm1
 ; X64-NEXT:    movq %xmm1, 16(%rdi)
 ; X64-NEXT:    movdqa %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i16vec12, %i16vec12* %ap, align 16
 	%b = load %i16vec12, %i16vec12* %bp, align 16
@@ -267,6 +267,7 @@
 ;
 ; X64-LABEL: add18i16:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movdqa (%rsi), %xmm0
 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
 ; X64-NEXT:    movdqa 32(%rsi), %xmm2
@@ -276,7 +277,6 @@
 ; X64-NEXT:    movd %xmm2, 32(%rdi)
 ; X64-NEXT:    movdqa %xmm1, 16(%rdi)
 ; X64-NEXT:    movdqa %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i16vec18, %i16vec18* %ap, align 16
 	%b = load %i16vec18, %i16vec18* %bp, align 16
@@ -305,13 +305,13 @@
 ;
 ; X64-LABEL: add3i8:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-NEXT:    paddd %xmm0, %xmm1
 ; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-NEXT:    pextrw $0, %xmm1, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i8vec3, %i8vec3* %ap, align 16
 	%b = load %i8vec3, %i8vec3* %bp, align 16
@@ -341,6 +341,7 @@
 ;
 ; X64-LABEL: add31i8:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movdqa (%rsi), %xmm0
 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
 ; X64-NEXT:    paddb (%rdx), %xmm0
@@ -350,7 +351,6 @@
 ; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
 ; X64-NEXT:    movq %xmm1, 16(%rdi)
 ; X64-NEXT:    movdqa %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i8vec31, %i8vec31* %ap, align 16
 	%b = load %i8vec31, %i8vec31* %bp, align 16
@@ -384,6 +384,7 @@
 ;
 ; X64-LABEL: rot:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movb $-98, 2(%rsi)
 ; X64-NEXT:    movw $-24930, (%rsi) # imm = 0x9E9E
 ; X64-NEXT:    movb $1, 2(%rdx)
@@ -395,7 +396,6 @@
 ; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 entry:
   %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
diff --git a/llvm/test/CodeGen/X86/widen_load-3.ll b/llvm/test/CodeGen/X86/widen_load-3.ll
index ce358d9..51e4abf 100644
--- a/llvm/test/CodeGen/X86/widen_load-3.ll
+++ b/llvm/test/CodeGen/X86/widen_load-3.ll
@@ -41,26 +41,26 @@
 ;
 ; X64-SSE-LABEL: load7_aligned:
 ; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movq %rdi, %rax
 ; X64-SSE-NEXT:    movaps (%rsi), %xmm0
 ; X64-SSE-NEXT:    movaps 16(%rsi), %xmm1
 ; X64-SSE-NEXT:    movaps 32(%rsi), %xmm2
-; X64-SSE-NEXT:    movq 48(%rsi), %rax
-; X64-SSE-NEXT:    movq %rax, 48(%rdi)
+; X64-SSE-NEXT:    movq 48(%rsi), %rcx
+; X64-SSE-NEXT:    movq %rcx, 48(%rdi)
 ; X64-SSE-NEXT:    movaps %xmm2, 32(%rdi)
 ; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
 ; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
-; X64-SSE-NEXT:    movq %rdi, %rax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: load7_aligned:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movq %rdi, %rax
 ; X64-AVX-NEXT:    vmovaps (%rsi), %ymm0
 ; X64-AVX-NEXT:    vmovaps 32(%rsi), %ymm1
 ; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
 ; X64-AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; X64-AVX-NEXT:    vmovlps %xmm0, 48(%rdi)
 ; X64-AVX-NEXT:    vmovaps %xmm1, 32(%rdi)
-; X64-AVX-NEXT:    movq %rdi, %rax
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
   %x1 = load <7 x i64>, <7 x i64>* %x
@@ -101,26 +101,26 @@
 ;
 ; X64-SSE-LABEL: load7_unaligned:
 ; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movq %rdi, %rax
 ; X64-SSE-NEXT:    movups (%rsi), %xmm0
 ; X64-SSE-NEXT:    movups 16(%rsi), %xmm1
 ; X64-SSE-NEXT:    movups 32(%rsi), %xmm2
-; X64-SSE-NEXT:    movq 48(%rsi), %rax
-; X64-SSE-NEXT:    movq %rax, 48(%rdi)
+; X64-SSE-NEXT:    movq 48(%rsi), %rcx
+; X64-SSE-NEXT:    movq %rcx, 48(%rdi)
 ; X64-SSE-NEXT:    movaps %xmm2, 32(%rdi)
 ; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
 ; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
-; X64-SSE-NEXT:    movq %rdi, %rax
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: load7_unaligned:
 ; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movq %rdi, %rax
 ; X64-AVX-NEXT:    vmovups (%rsi), %ymm0
 ; X64-AVX-NEXT:    vmovups 32(%rsi), %xmm1
-; X64-AVX-NEXT:    movq 48(%rsi), %rax
-; X64-AVX-NEXT:    movq %rax, 48(%rdi)
+; X64-AVX-NEXT:    movq 48(%rsi), %rcx
+; X64-AVX-NEXT:    movq %rcx, 48(%rdi)
 ; X64-AVX-NEXT:    vmovaps %xmm1, 32(%rdi)
 ; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
-; X64-AVX-NEXT:    movq %rdi, %rax
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
   %x1 = load <7 x i64>, <7 x i64>* %x, align 1
diff --git a/llvm/test/CodeGen/X86/win64_vararg.ll b/llvm/test/CodeGen/X86/win64_vararg.ll
index 20386bf..fc9a10e 100644
--- a/llvm/test/CodeGen/X86/win64_vararg.ll
+++ b/llvm/test/CodeGen/X86/win64_vararg.ll
@@ -121,10 +121,10 @@
 }
 ; CHECK-LABEL: sret_arg:
 ; CHECK: pushq
+; CHECK: movq %rcx, %rax
 ; CHECK-DAG: movq %r9, 40(%rsp)
 ; CHECK-DAG: movq %r8, 32(%rsp)
 ; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
-; CHECK: movl %[[tmp]], (%[[sret:[^ ]*]])
-; CHECK: movq %[[sret]], %rax
+; CHECK: movl %[[tmp]], (%rax)
 ; CHECK: popq
 ; CHECK: retq
diff --git a/llvm/test/CodeGen/X86/x64-cet-intrinsics.ll b/llvm/test/CodeGen/X86/x64-cet-intrinsics.ll
index 27d78f1..49a3e4a 100644
--- a/llvm/test/CodeGen/X86/x64-cet-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/x64-cet-intrinsics.ll
@@ -30,8 +30,8 @@
 define i32 @test_rdsspd(i32 %a) {
 ; CHECK-LABEL: test_rdsspd:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    rdsspd %edi
 ; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    rdsspd %eax
 ; CHECK-NEXT:    retq
 entry:
   %0 = call i32 @llvm.x86.rdsspd(i32 %a)
@@ -43,8 +43,8 @@
 define i64 @test_rdsspq(i64 %a) {
 ; CHECK-LABEL: test_rdsspq:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    rdsspq %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    rdsspq %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = call i64 @llvm.x86.rdsspq(i64 %a)
diff --git a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
index dbb6dca..253a5d1 100644
--- a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
+++ b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
@@ -124,8 +124,8 @@
 define i64 @and1_optsize(i64 %x) optsize {
 ; CHECK-LABEL: and1_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btrq $31, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $31, %rax
 ; CHECK-NEXT:    retq
   %a = and i64 %x, 18446744071562067967 ; clear bit 31
   ret i64 %a
@@ -134,8 +134,8 @@
 define i64 @and2_optsize(i64 %x) optsize {
 ; CHECK-LABEL: and2_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btrq $32, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $32, %rax
 ; CHECK-NEXT:    retq
   %a = and i64 %x, 18446744069414584319 ; clear bit 32
   ret i64 %a
@@ -144,8 +144,8 @@
 define i64 @and3_optsize(i64 %x) optsize {
 ; CHECK-LABEL: and3_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btrq $62, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $62, %rax
 ; CHECK-NEXT:    retq
   %a = and i64 %x, 13835058055282163711 ; clear bit 62
   ret i64 %a
@@ -154,8 +154,8 @@
 define i64 @and4_optsize(i64 %x) optsize {
 ; CHECK-LABEL: and4_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btrq $63, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $63, %rax
 ; CHECK-NEXT:    retq
   %a = and i64 %x, 9223372036854775807 ; clear bit 63
   ret i64 %a
@@ -164,8 +164,8 @@
 define i64 @or1_optsize(i64 %x) optsize {
 ; CHECK-LABEL: or1_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btsq $31, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $31, %rax
 ; CHECK-NEXT:    retq
   %a = or i64 %x, 2147483648 ; set bit 31
   ret i64 %a
@@ -174,8 +174,8 @@
 define i64 @or2_optsize(i64 %x) optsize {
 ; CHECK-LABEL: or2_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btsq $32, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $32, %rax
 ; CHECK-NEXT:    retq
   %a = or i64 %x, 4294967296 ; set bit 32
   ret i64 %a
@@ -184,8 +184,8 @@
 define i64 @or3_optsize(i64 %x) optsize {
 ; CHECK-LABEL: or3_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btsq $62, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $62, %rax
 ; CHECK-NEXT:    retq
   %a = or i64 %x, 4611686018427387904 ; set bit 62
   ret i64 %a
@@ -194,8 +194,8 @@
 define i64 @or4_optsize(i64 %x) optsize {
 ; CHECK-LABEL: or4_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btsq $63, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $63, %rax
 ; CHECK-NEXT:    retq
   %a = or i64 %x, 9223372036854775808 ; set bit 63
   ret i64 %a
@@ -204,8 +204,8 @@
 define i64 @xor1_optsize(i64 %x) optsize {
 ; CHECK-LABEL: xor1_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btcq $31, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $31, %rax
 ; CHECK-NEXT:    retq
   %a = xor i64 %x, 2147483648 ; toggle bit 31
   ret i64 %a
@@ -214,8 +214,8 @@
 define i64 @xor2_optsize(i64 %x) optsize {
 ; CHECK-LABEL: xor2_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btcq $32, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $32, %rax
 ; CHECK-NEXT:    retq
   %a = xor i64 %x, 4294967296 ; toggle bit 32
   ret i64 %a
@@ -224,8 +224,8 @@
 define i64 @xor3_optsize(i64 %x) optsize {
 ; CHECK-LABEL: xor3_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btcq $62, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $62, %rax
 ; CHECK-NEXT:    retq
   %a = xor i64 %x, 4611686018427387904 ; toggle bit 62
   ret i64 %a
@@ -234,8 +234,8 @@
 define i64 @xor4_optsize(i64 %x) optsize {
 ; CHECK-LABEL: xor4_optsize:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    btcq $63, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $63, %rax
 ; CHECK-NEXT:    retq
   %a = xor i64 %x, 9223372036854775808 ; toggle bit 63
   ret i64 %a
diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index 15386a3..5a18010 100644
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -14,8 +14,8 @@
 define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
 ; CHECK-LABEL: _Z8lshift10mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shldq $10, %rsi, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shldq $10, %rsi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %shl = shl i64 %a, 10
@@ -40,8 +40,8 @@
 define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
 ; CHECK-LABEL: _Z8lshift11mm:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    shldq $11, %rsi, %rdi
 ; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shldq $11, %rsi, %rax
 ; CHECK-NEXT:    retq
 entry:
   %shl = shl i64 %a, 11
diff --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
index 56a994d..791ad80 100644
--- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
@@ -336,14 +336,14 @@
 ; CHECK-LABEL: test_cmov_memoperand:
 entry:
   %cond = icmp ugt i32 %a, %b
+; CHECK:         movl %edx, %eax
 ; CHECK:         cmpl
   %load = load i32, i32* %y
   %z = select i1 %cond, i32 %x, i32 %load
 ; CHECK-NOT:     cmov
 ; CHECK:         ja [[FALSE_BB:.*]]
-; CHECK:         movl (%r{{..}}), %[[R:.*]]
+; CHECK:         movl (%rcx), %eax
 ; CHECK:       [[FALSE_BB]]:
-; CHECK:         movl %[[R]], %
   ret i32 %z
 }
 
@@ -353,6 +353,7 @@
 ; CHECK-LABEL: test_cmov_memoperand_in_group:
 entry:
   %cond = icmp ugt i32 %a, %b
+; CHECK:         movl %edx, %eax
 ; CHECK:         cmpl
   %y = load i32, i32* %y.ptr
   %z1 = select i1 %cond, i32 %x, i32 %a
@@ -362,17 +363,16 @@
 ; CHECK:         ja [[FALSE_BB:.*]]
 ; CHECK-DAG:     movl %{{.*}}, %[[R1:.*]]
 ; CHECK-DAG:     movl (%r{{..}}), %[[R2:.*]]
-; CHECK-DAG:     movl %{{.*}} %[[R3:.*]]
+; CHECK-DAG:     movl %{{.*}} %eax
 ; CHECK:       [[FALSE_BB]]:
 ; CHECK:         addl
 ; CHECK-DAG:       %[[R1]]
 ; CHECK-DAG:       ,
-; CHECK-DAG:       %[[R3]]
+; CHECK-DAG:       %eax
 ; CHECK-DAG:     addl
 ; CHECK-DAG:       %[[R2]]
 ; CHECK-DAG:       ,
-; CHECK-DAG:       %[[R3]]
-; CHECK:         movl %[[R3]], %eax
+; CHECK-DAG:       %eax
 ; CHECK:         retq
   %s1 = add i32 %z1, %z2
   %s2 = add i32 %s1, %z3
@@ -384,6 +384,7 @@
 ; CHECK-LABEL: test_cmov_memoperand_in_group2:
 entry:
   %cond = icmp ugt i32 %a, %b
+; CHECK:         movl %edx, %eax
 ; CHECK:         cmpl
   %y = load i32, i32* %y.ptr
   %z2 = select i1 %cond, i32 %a, i32 %x
@@ -393,17 +394,16 @@
 ; CHECK:         jbe [[FALSE_BB:.*]]
 ; CHECK-DAG:     movl %{{.*}}, %[[R1:.*]]
 ; CHECK-DAG:     movl (%r{{..}}), %[[R2:.*]]
-; CHECK-DAG:     movl %{{.*}} %[[R3:.*]]
+; CHECK-DAG:     movl %{{.*}} %eax
 ; CHECK:       [[FALSE_BB]]:
 ; CHECK:         addl
 ; CHECK-DAG:       %[[R1]]
 ; CHECK-DAG:       ,
-; CHECK-DAG:       %[[R3]]
+; CHECK-DAG:       %eax
 ; CHECK-DAG:     addl
 ; CHECK-DAG:       %[[R2]]
 ; CHECK-DAG:       ,
-; CHECK-DAG:       %[[R3]]
-; CHECK:         movl %[[R3]], %eax
+; CHECK-DAG:       %eax
 ; CHECK:         retq
   %s1 = add i32 %z1, %z2
   %s2 = add i32 %s1, %z3
@@ -434,15 +434,15 @@
 ; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
 entry:
   %cond = icmp ugt i32 %a, %b
+; CHECK:         movl %edi, %eax
 ; CHECK:         cmpl
   %p = select i1 %cond, i32* %x, i32* %y
   %load = load i32, i32* %p
   %z = select i1 %cond, i32 %a, i32 %load
 ; CHECK-NOT:     cmov
 ; CHECK:         ja [[FALSE_BB:.*]]
-; CHECK:         movl (%r{{..}}), %[[R:.*]]
+; CHECK:         movl (%r{{..}}), %eax
 ; CHECK:       [[FALSE_BB]]:
-; CHECK:         movl %[[R]], %eax
 ; CHECK:         retq
   ret i32 %z
 }
@@ -453,6 +453,7 @@
 ; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
 entry:
   %cond = icmp ugt i32 %a, %b
+; CHECK:         movl %edi, %eax
 ; CHECK:         cmpl
   %load1 = load i32*, i32** %y
   %p = select i1 %cond, i32* %x, i32* %load1
@@ -461,9 +462,8 @@
 ; CHECK-NOT:     cmov
 ; CHECK:         ja [[FALSE_BB:.*]]
 ; CHECK:         movq (%r{{..}}), %[[R1:.*]]
-; CHECK:         movl (%[[R1]]), %[[R2:.*]]
+; CHECK:         movl (%[[R1]]), %eax
 ; CHECK:       [[FALSE_BB]]:
-; CHECK:         movl %[[R2]], %eax
 ; CHECK:         retq
   ret i32 %z
 }
@@ -475,6 +475,7 @@
 ; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr3:
 entry:
   %cond = icmp ugt i32 %a, %b
+; CHECK:         movl %edi, %eax
 ; CHECK:         cmpl
   %p = select i1 %cond, i32* %x, i32* %y
   %p2 = select i1 %cond, i32* %z, i32* %p
@@ -482,9 +483,8 @@
   %r = select i1 %cond, i32 %a, i32 %load
 ; CHECK-NOT:     cmov
 ; CHECK:         ja [[FALSE_BB:.*]]
-; CHECK:         movl (%r{{..}}), %[[R:.*]]
+; CHECK:         movl (%r{{..}}), %eax
 ; CHECK:       [[FALSE_BB]]:
-; CHECK:         movl %[[R]], %eax
 ; CHECK:         retq
   ret i32 %r
 }
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index eecc7fd..2c278e7 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -83,9 +83,7 @@
 ; DISABLE: testl %edi, %edi
 ; DISABLE: je [[ELSE_LABEL:LBB[0-9_]+]]
 ;
-; SUM is in %esi because it is coalesced with the second
-; argument on the else path.
-; CHECK: xorl [[SUM:%esi]], [[SUM]]
+; CHECK: xorl [[SUM:%eax]], [[SUM]]
 ; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
 ;
 ; Next BB.
@@ -99,23 +97,22 @@
 ; SUM << 3.
 ; CHECK: shll $3, [[SUM]]
 ;
-; Jump to epilogue.
-; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+; DISABLE: popq
+; DISABLE: retq
 ;
 ; DISABLE: [[ELSE_LABEL]]: ## %if.else
-; Shift second argument by one and store into returned register.
-; DISABLE: addl %esi, %esi
-; DISABLE: [[EPILOG_BB]]: ## %if.end
+; Shift second argument by one in returned register.
+; DISABLE: movl %esi, %eax
+; DISABLE: addl %esi, %eax
 ;
 ; Epilogue code.
 ; CHECK-DAG: popq %rbx
-; CHECK-DAG: movl %esi, %eax
 ; CHECK: retq
 ;
 ; ENABLE: [[ELSE_LABEL]]: ## %if.else
 ; Shift second argument by one and store into returned register.
-; ENABLE: addl %esi, %esi
-; ENABLE-NEXT: movl %esi, %eax
+; ENABLE: movl %esi, %eax
+; ENABLE: addl %esi, %eax
 ; ENABLE-NEXT: retq
 define i32 @freqSaveAndRestoreOutsideLoop(i32 %cond, i32 %N) {
 entry:
@@ -210,7 +207,7 @@
 ; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; CHECK: nop
-; CHECK: xorl [[SUM:%esi]], [[SUM]]
+; CHECK: xorl [[SUM:%eax]], [[SUM]]
 ; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
@@ -222,22 +219,22 @@
 ; CHECK: nop
 ; CHECK: shll $3, [[SUM]]
 ;
-; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+; DISABLE: popq
+; DISABLE: retq
 ;
 ; DISABLE: [[ELSE_LABEL]]: ## %if.else
-; Shift second argument by one and store into returned register.
-; DISABLE: addl %esi, %esi
-; DISABLE: [[EPILOG_BB]]: ## %if.end
+; Shift second argument by one in returned register.
+; DISABLE: movl %esi, %eax
+; DISABLE: addl %esi, %eax
 ;
 ; Epilogue code.
 ; CHECK-DAG: popq %rbx
-; CHECK-DAG: movl %esi, %eax
 ; CHECK: retq
 ;
 ; ENABLE: [[ELSE_LABEL]]: ## %if.else
 ; Shift second argument by one and store into returned register.
-; ENABLE: addl %esi, %esi
-; ENABLE-NEXT: movl %esi, %eax
+; ENABLE: movl %esi, %eax
+; ENABLE: addl %esi, %eax
 ; ENABLE-NEXT: retq
 define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) {
 entry:
@@ -286,7 +283,7 @@
 ; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; CHECK: nop
-; CHECK: xorl [[SUM:%esi]], [[SUM]]
+; CHECK: xorl [[SUM:%eax]], [[SUM]]
 ; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
 ;
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
@@ -297,23 +294,23 @@
 ; Next BB.
 ; CHECK: shll $3, [[SUM]]
 ;
-; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+; DISABLE: popq
+; DISABLE: retq
 ;
 ; DISABLE: [[ELSE_LABEL]]: ## %if.else
 
-; Shift second argument by one and store into returned register.
-; DISABLE: addl %esi, %esi
-; DISABLE: [[EPILOG_BB]]: ## %if.end
+; Shift second argument by one in returned register.
+; DISABLE: movl %esi, %eax
+; DISABLE: addl %esi, %eax
 ;
 ; Epilogue code.
 ; CHECK-DAG: popq %rbx
-; CHECK-DAG: movl %esi, %eax
 ; CHECK: retq
 ;
 ; ENABLE: [[ELSE_LABEL]]: ## %if.else
 ; Shift second argument by one and store into returned register.
-; ENABLE: addl %esi, %esi
-; ENABLE-NEXT: movl %esi, %eax
+; ENABLE: movl %esi, %eax
+; ENABLE: addl %esi, %eax
 ; ENABLE-NEXT: retq
 define i32 @loopInfoRestoreOutsideLoop(i32 %cond, i32 %N) nounwind {
 entry:
@@ -379,24 +376,24 @@
 ; CHECK-NEXT: jne [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: nop
-; CHECK: xorl %esi, %esi
+; CHECK: xorl %eax, %eax
 ;
-; DISABLE: jmp [[EPILOG_BB:LBB[0-9_]+]]
+; DISABLE: popq
+; DISABLE: retq
 ;
 ; DISABLE: [[ELSE_LABEL]]: ## %if.else
-; Shift second argument by one and store into returned register.
-; DISABLE: addl %esi, %esi
-; DISABLE: [[EPILOG_BB]]: ## %if.end
+; Shift second argument by one in returned register.
+; DISABLE: movl %esi, %eax
+; DISABLE: addl %esi, %eax
 ;
 ; Epilogue code.
 ; CHECK-DAG: popq %rbx
-; CHECK-DAG: movl %esi, %eax
 ; CHECK: retq
 ;
 ; ENABLE: [[ELSE_LABEL]]: ## %if.else
 ; Shift second argument by one and store into returned register.
-; ENABLE: addl %esi, %esi
-; ENABLE-NEXT: movl %esi, %eax
+; ENABLE: movl %esi, %eax
+; ENABLE: addl %esi, %eax
 ; ENABLE-NEXT: retq
 define i32 @inlineAsm(i32 %cond, i32 %N) {
 entry:
diff --git a/llvm/test/CodeGen/X86/xaluo.ll b/llvm/test/CodeGen/X86/xaluo.ll
index 7d4cd22..25aa45e 100644
--- a/llvm/test/CodeGen/X86/xaluo.ll
+++ b/llvm/test/CodeGen/X86/xaluo.ll
@@ -719,26 +719,26 @@
 define i32 @saddoselecti32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: saddoselecti32:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    addl %esi, %eax
-; SDAG-NEXT:    cmovol %edi, %esi
 ; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    movl %edi, %ecx
+; SDAG-NEXT:    addl %esi, %ecx
+; SDAG-NEXT:    cmovol %edi, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: saddoselecti32:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    addl %esi, %eax
-; FAST-NEXT:    cmovol %edi, %esi
 ; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    movl %edi, %ecx
+; FAST-NEXT:    addl %esi, %ecx
+; FAST-NEXT:    cmovol %edi, %eax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: saddoselecti32:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movl %edi, %eax
-; KNL-NEXT:    addl %esi, %eax
-; KNL-NEXT:    cmovol %edi, %esi
 ; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    movl %edi, %ecx
+; KNL-NEXT:    addl %esi, %ecx
+; KNL-NEXT:    cmovol %edi, %eax
 ; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -749,26 +749,26 @@
 define i64 @saddoselecti64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: saddoselecti64:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movq %rdi, %rax
-; SDAG-NEXT:    addq %rsi, %rax
-; SDAG-NEXT:    cmovoq %rdi, %rsi
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    movq %rdi, %rcx
+; SDAG-NEXT:    addq %rsi, %rcx
+; SDAG-NEXT:    cmovoq %rdi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: saddoselecti64:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movq %rdi, %rax
-; FAST-NEXT:    addq %rsi, %rax
-; FAST-NEXT:    cmovoq %rdi, %rsi
 ; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    movq %rdi, %rcx
+; FAST-NEXT:    addq %rsi, %rcx
+; FAST-NEXT:    cmovoq %rdi, %rax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: saddoselecti64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    addq %rsi, %rax
-; KNL-NEXT:    cmovoq %rdi, %rsi
 ; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    movq %rdi, %rcx
+; KNL-NEXT:    addq %rsi, %rcx
+; KNL-NEXT:    cmovoq %rdi, %rax
 ; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -779,26 +779,26 @@
 define i32 @uaddoselecti32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: uaddoselecti32:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    addl %esi, %eax
-; SDAG-NEXT:    cmovbl %edi, %esi
 ; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    movl %edi, %ecx
+; SDAG-NEXT:    addl %esi, %ecx
+; SDAG-NEXT:    cmovbl %edi, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: uaddoselecti32:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    addl %esi, %eax
-; FAST-NEXT:    cmovbl %edi, %esi
 ; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    movl %edi, %ecx
+; FAST-NEXT:    addl %esi, %ecx
+; FAST-NEXT:    cmovbl %edi, %eax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: uaddoselecti32:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movl %edi, %eax
-; KNL-NEXT:    addl %esi, %eax
-; KNL-NEXT:    cmovbl %edi, %esi
 ; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    movl %edi, %ecx
+; KNL-NEXT:    addl %esi, %ecx
+; KNL-NEXT:    cmovbl %edi, %eax
 ; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -809,26 +809,26 @@
 define i64 @uaddoselecti64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: uaddoselecti64:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movq %rdi, %rax
-; SDAG-NEXT:    addq %rsi, %rax
-; SDAG-NEXT:    cmovbq %rdi, %rsi
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    movq %rdi, %rcx
+; SDAG-NEXT:    addq %rsi, %rcx
+; SDAG-NEXT:    cmovbq %rdi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: uaddoselecti64:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movq %rdi, %rax
-; FAST-NEXT:    addq %rsi, %rax
-; FAST-NEXT:    cmovbq %rdi, %rsi
 ; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    movq %rdi, %rcx
+; FAST-NEXT:    addq %rsi, %rcx
+; FAST-NEXT:    cmovbq %rdi, %rax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: uaddoselecti64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    addq %rsi, %rax
-; KNL-NEXT:    cmovbq %rdi, %rsi
 ; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    movq %rdi, %rcx
+; KNL-NEXT:    addq %rsi, %rcx
+; KNL-NEXT:    cmovbq %rdi, %rax
 ; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -839,23 +839,23 @@
 define i32 @ssuboselecti32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: ssuboselecti32:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    cmpl %esi, %edi
-; SDAG-NEXT:    cmovol %edi, %esi
 ; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    cmovol %edi, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: ssuboselecti32:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    cmpl %esi, %edi
-; FAST-NEXT:    cmovol %edi, %esi
 ; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    cmovol %edi, %eax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: ssuboselecti32:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    cmovol %edi, %esi
 ; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    cmovol %edi, %eax
 ; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -866,23 +866,23 @@
 define i64 @ssuboselecti64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: ssuboselecti64:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    cmpq %rsi, %rdi
-; SDAG-NEXT:    cmovoq %rdi, %rsi
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    cmpq %rsi, %rdi
+; SDAG-NEXT:    cmovoq %rdi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: ssuboselecti64:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    cmpq %rsi, %rdi
-; FAST-NEXT:    cmovoq %rdi, %rsi
 ; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    cmpq %rsi, %rdi
+; FAST-NEXT:    cmovoq %rdi, %rax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: ssuboselecti64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    cmpq %rsi, %rdi
-; KNL-NEXT:    cmovoq %rdi, %rsi
 ; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    cmovoq %rdi, %rax
 ; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -893,23 +893,23 @@
 define i32 @usuboselecti32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: usuboselecti32:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    cmpl %esi, %edi
-; SDAG-NEXT:    cmovbl %edi, %esi
 ; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    cmovbl %edi, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: usuboselecti32:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    cmpl %esi, %edi
-; FAST-NEXT:    cmovbl %edi, %esi
 ; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    cmovbl %edi, %eax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: usuboselecti32:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    cmpl %esi, %edi
-; KNL-NEXT:    cmovbl %edi, %esi
 ; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    cmovbl %edi, %eax
 ; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -920,23 +920,23 @@
 define i64 @usuboselecti64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: usuboselecti64:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    cmpq %rsi, %rdi
-; SDAG-NEXT:    cmovbq %rdi, %rsi
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    cmpq %rsi, %rdi
+; SDAG-NEXT:    cmovbq %rdi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: usuboselecti64:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    cmpq %rsi, %rdi
-; FAST-NEXT:    cmovbq %rdi, %rsi
 ; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    cmpq %rsi, %rdi
+; FAST-NEXT:    cmovbq %rdi, %rax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: usuboselecti64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    cmpq %rsi, %rdi
-; KNL-NEXT:    cmovbq %rdi, %rsi
 ; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    cmovbq %rdi, %rax
 ; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -1372,23 +1372,23 @@
 define {i64, i1} @usuboovf(i64 %a, i64 %b) {
 ; SDAG-LABEL: usuboovf:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    notq %rsi
-; SDAG-NEXT:    xorl %edx, %edx
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    notq %rax
+; SDAG-NEXT:    xorl %edx, %edx
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: usuboovf:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    notq %rsi
-; FAST-NEXT:    xorl %edx, %edx
 ; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    notq %rax
+; FAST-NEXT:    xorl %edx, %edx
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: usuboovf:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    notq %rsi
-; KNL-NEXT:    xorl %edx, %edx
 ; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    notq %rax
+; KNL-NEXT:    xorl %edx, %edx
 ; KNL-NEXT:    retq
   %t0 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %a)
   %v0 = extractvalue {i64, i1} %t0, 0
diff --git a/llvm/test/CodeGen/X86/xchg-nofold.ll b/llvm/test/CodeGen/X86/xchg-nofold.ll
index b602041..2e24f8b 100644
--- a/llvm/test/CodeGen/X86/xchg-nofold.ll
+++ b/llvm/test/CodeGen/X86/xchg-nofold.ll
@@ -9,20 +9,21 @@
 define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind {
 ; CHECK-LABEL: _Z3fooRSt6atomicIbEb:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    shrq $3, %rax
-; CHECK-NEXT:    movb 2147450880(%rax), %al
-; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    shrq $3, %rcx
+; CHECK-NEXT:    movb 2147450880(%rcx), %cl
+; CHECK-NEXT:    testb %cl, %cl
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    movl %edi, %ecx
-; CHECK-NEXT:    andl $7, %ecx
-; CHECK-NEXT:    cmpb %al, %cl
+; CHECK-NEXT:    movl %edi, %edx
+; CHECK-NEXT:    andl $7, %edx
+; CHECK-NEXT:    cmpb %cl, %dl
 ; CHECK-NEXT:    jge .LBB0_2
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    xchgb %al, (%rdi)
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    xchgb %cl, (%rdi)
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    pushq %rax
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 3788d9c..8d2e81f 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -92,6 +92,7 @@
 ; SDAG-LABEL: smuloi8:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    imulb %sil
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -101,6 +102,7 @@
 ; FAST-LABEL: smuloi8:
 ; FAST:       ## %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    ## kill: def $al killed $al killed $eax
 ; FAST-NEXT:    imulb %sil
 ; FAST-NEXT:    seto %cl
 ; FAST-NEXT:    movb %al, (%rdx)
@@ -111,6 +113,7 @@
 ; KNL-LABEL: smuloi8:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    imulb %sil
 ; KNL-NEXT:    seto %cl
 ; KNL-NEXT:    movb %al, (%rdx)
@@ -218,6 +221,7 @@
 ; SDAG-LABEL: umuloi8:
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    mulb %sil
 ; SDAG-NEXT:    seto %cl
 ; SDAG-NEXT:    movb %al, (%rdx)
@@ -227,6 +231,7 @@
 ; FAST-LABEL: umuloi8:
 ; FAST:       ## %bb.0:
 ; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    ## kill: def $al killed $al killed $eax
 ; FAST-NEXT:    mulb %sil
 ; FAST-NEXT:    seto %cl
 ; FAST-NEXT:    movb %al, (%rdx)
@@ -237,6 +242,7 @@
 ; KNL-LABEL: umuloi8:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    ## kill: def $al killed $al killed $eax
 ; KNL-NEXT:    mulb %sil
 ; KNL-NEXT:    seto %cl
 ; KNL-NEXT:    movb %al, (%rdx)
@@ -254,6 +260,7 @@
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    movq %rdx, %rcx
 ; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; SDAG-NEXT:    mulw %si
 ; SDAG-NEXT:    seto %dl
 ; SDAG-NEXT:    movw %ax, (%rcx)
@@ -264,6 +271,7 @@
 ; FAST:       ## %bb.0:
 ; FAST-NEXT:    movq %rdx, %rcx
 ; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; FAST-NEXT:    mulw %si
 ; FAST-NEXT:    seto %dl
 ; FAST-NEXT:    movw %ax, (%rcx)
@@ -275,6 +283,7 @@
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    movq %rdx, %rcx
 ; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; KNL-NEXT:    mulw %si
 ; KNL-NEXT:    seto %dl
 ; KNL-NEXT:    movw %ax, (%rcx)
@@ -369,26 +378,26 @@
 define i32 @smuloselecti32(i32 %v1, i32 %v2) {
 ; SDAG-LABEL: smuloselecti32:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movl %edi, %eax
-; SDAG-NEXT:    imull %esi, %eax
-; SDAG-NEXT:    cmovol %edi, %esi
 ; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    movl %edi, %ecx
+; SDAG-NEXT:    imull %esi, %ecx
+; SDAG-NEXT:    cmovol %edi, %eax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloselecti32:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movl %edi, %eax
-; FAST-NEXT:    imull %esi, %eax
-; FAST-NEXT:    cmovol %edi, %esi
 ; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    movl %edi, %ecx
+; FAST-NEXT:    imull %esi, %ecx
+; FAST-NEXT:    cmovol %edi, %eax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: smuloselecti32:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movl %edi, %eax
-; KNL-NEXT:    imull %esi, %eax
-; KNL-NEXT:    cmovol %edi, %esi
 ; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    movl %edi, %ecx
+; KNL-NEXT:    imull %esi, %ecx
+; KNL-NEXT:    cmovol %edi, %eax
 ; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -399,26 +408,26 @@
 define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; SDAG-LABEL: smuloselecti64:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movq %rdi, %rax
-; SDAG-NEXT:    imulq %rsi, %rax
-; SDAG-NEXT:    cmovoq %rdi, %rsi
 ; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    movq %rdi, %rcx
+; SDAG-NEXT:    imulq %rsi, %rcx
+; SDAG-NEXT:    cmovoq %rdi, %rax
 ; SDAG-NEXT:    retq
 ;
 ; FAST-LABEL: smuloselecti64:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movq %rdi, %rax
-; FAST-NEXT:    imulq %rsi, %rax
-; FAST-NEXT:    cmovoq %rdi, %rsi
 ; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    movq %rdi, %rcx
+; FAST-NEXT:    imulq %rsi, %rcx
+; FAST-NEXT:    cmovoq %rdi, %rax
 ; FAST-NEXT:    retq
 ;
 ; KNL-LABEL: smuloselecti64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    imulq %rsi, %rax
-; KNL-NEXT:    cmovoq %rdi, %rsi
 ; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    movq %rdi, %rcx
+; KNL-NEXT:    imulq %rsi, %rcx
+; KNL-NEXT:    cmovoq %rdi, %rax
 ; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -694,8 +703,8 @@
 define i1 @bug27873(i64 %c1, i1 %c2) {
 ; SDAG-LABEL: bug27873:
 ; SDAG:       ## %bb.0:
-; SDAG-NEXT:    movl $160, %ecx
 ; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    movl $160, %ecx
 ; SDAG-NEXT:    mulq %rcx
 ; SDAG-NEXT:    seto %al
 ; SDAG-NEXT:    orb %sil, %al
@@ -703,8 +712,8 @@
 ;
 ; FAST-LABEL: bug27873:
 ; FAST:       ## %bb.0:
-; FAST-NEXT:    movl $160, %ecx
 ; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    movl $160, %ecx
 ; FAST-NEXT:    mulq %rcx
 ; FAST-NEXT:    seto %al
 ; FAST-NEXT:    orb %sil, %al
@@ -712,8 +721,8 @@
 ;
 ; KNL-LABEL: bug27873:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movl $160, %ecx
 ; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    movl $160, %ecx
 ; KNL-NEXT:    mulq %rcx
 ; KNL-NEXT:    seto %al
 ; KNL-NEXT:    orb %sil, %al
diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll
index f73fdb2..3aef8dd 100644
--- a/llvm/test/CodeGen/X86/xor.ll
+++ b/llvm/test/CodeGen/X86/xor.ll
@@ -44,18 +44,18 @@
 ;
 ; X64-LIN-LABEL: test3:
 ; X64-LIN:       # %bb.0: # %entry
-; X64-LIN-NEXT:    notl %esi
-; X64-LIN-NEXT:    andl %edi, %esi
-; X64-LIN-NEXT:    shrl %esi
 ; X64-LIN-NEXT:    movl %esi, %eax
+; X64-LIN-NEXT:    notl %eax
+; X64-LIN-NEXT:    andl %edi, %eax
+; X64-LIN-NEXT:    shrl %eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test3:
 ; X64-WIN:       # %bb.0: # %entry
-; X64-WIN-NEXT:    notl %edx
-; X64-WIN-NEXT:    andl %ecx, %edx
-; X64-WIN-NEXT:    shrl %edx
 ; X64-WIN-NEXT:    movl %edx, %eax
+; X64-WIN-NEXT:    notl %eax
+; X64-WIN-NEXT:    andl %ecx, %eax
+; X64-WIN-NEXT:    shrl %eax
 ; X64-WIN-NEXT:    retq
 entry:
   %tmp1not = xor i32 %b, -2
@@ -84,34 +84,34 @@
 ;
 ; X64-LIN-LABEL: test4:
 ; X64-LIN:       # %bb.0: # %entry
+; X64-LIN-NEXT:    movl %edi, %eax
 ; X64-LIN-NEXT:    .p2align 4, 0x90
 ; X64-LIN-NEXT:  .LBB3_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT:    xorl %esi, %edi
-; X64-LIN-NEXT:    movl %edi, %eax
-; X64-LIN-NEXT:    notl %eax
-; X64-LIN-NEXT:    andl %esi, %eax
-; X64-LIN-NEXT:    addl %eax, %eax
-; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    xorl %esi, %eax
+; X64-LIN-NEXT:    movl %eax, %ecx
+; X64-LIN-NEXT:    notl %ecx
+; X64-LIN-NEXT:    andl %esi, %ecx
+; X64-LIN-NEXT:    addl %ecx, %ecx
+; X64-LIN-NEXT:    movl %ecx, %esi
 ; X64-LIN-NEXT:    jne .LBB3_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
-; X64-LIN-NEXT:    movl %edi, %eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test4:
 ; X64-WIN:       # %bb.0: # %entry
+; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    .p2align 4, 0x90
 ; X64-WIN-NEXT:  .LBB3_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT:    xorl %edx, %ecx
-; X64-WIN-NEXT:    movl %ecx, %eax
-; X64-WIN-NEXT:    notl %eax
-; X64-WIN-NEXT:    andl %edx, %eax
-; X64-WIN-NEXT:    addl %eax, %eax
-; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    xorl %edx, %eax
+; X64-WIN-NEXT:    movl %eax, %ecx
+; X64-WIN-NEXT:    notl %ecx
+; X64-WIN-NEXT:    andl %edx, %ecx
+; X64-WIN-NEXT:    addl %ecx, %ecx
+; X64-WIN-NEXT:    movl %ecx, %edx
 ; X64-WIN-NEXT:    jne .LBB3_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
-; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    retq
 entry:
   br label %bb
@@ -150,38 +150,39 @@
 ;
 ; X64-LIN-LABEL: test5:
 ; X64-LIN:       # %bb.0: # %entry
+; X64-LIN-NEXT:    movl %edi, %eax
 ; X64-LIN-NEXT:    .p2align 4, 0x90
 ; X64-LIN-NEXT:  .LBB4_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT:    xorl %esi, %edi
-; X64-LIN-NEXT:    movl %edi, %eax
-; X64-LIN-NEXT:    notl %eax
-; X64-LIN-NEXT:    andl %esi, %eax
-; X64-LIN-NEXT:    addl %eax, %eax
-; X64-LIN-NEXT:    testw %ax, %ax
-; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    xorl %esi, %eax
+; X64-LIN-NEXT:    movl %eax, %ecx
+; X64-LIN-NEXT:    notl %ecx
+; X64-LIN-NEXT:    andl %esi, %ecx
+; X64-LIN-NEXT:    addl %ecx, %ecx
+; X64-LIN-NEXT:    testw %cx, %cx
+; X64-LIN-NEXT:    movl %ecx, %esi
 ; X64-LIN-NEXT:    jne .LBB4_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
-; X64-LIN-NEXT:    movl %edi, %eax
+; X64-LIN-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test5:
 ; X64-WIN:       # %bb.0: # %entry
 ; X64-WIN-NEXT:    # kill: def $dx killed $dx def $edx
-; X64-WIN-NEXT:    # kill: def $cx killed $cx def $ecx
+; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    .p2align 4, 0x90
 ; X64-WIN-NEXT:  .LBB4_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT:    xorl %edx, %ecx
-; X64-WIN-NEXT:    movl %ecx, %eax
-; X64-WIN-NEXT:    notl %eax
-; X64-WIN-NEXT:    andl %edx, %eax
-; X64-WIN-NEXT:    addl %eax, %eax
-; X64-WIN-NEXT:    testw %ax, %ax
-; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    xorl %edx, %eax
+; X64-WIN-NEXT:    movl %eax, %ecx
+; X64-WIN-NEXT:    notl %ecx
+; X64-WIN-NEXT:    andl %edx, %ecx
+; X64-WIN-NEXT:    addl %ecx, %ecx
+; X64-WIN-NEXT:    testw %cx, %cx
+; X64-WIN-NEXT:    movl %ecx, %edx
 ; X64-WIN-NEXT:    jne .LBB4_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
-; X64-WIN-NEXT:    movl %ecx, %eax
+; X64-WIN-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-WIN-NEXT:    retq
 entry:
   br label %bb
@@ -218,34 +219,35 @@
 ;
 ; X64-LIN-LABEL: test6:
 ; X64-LIN:       # %bb.0: # %entry
+; X64-LIN-NEXT:    movl %edi, %eax
 ; X64-LIN-NEXT:    .p2align 4, 0x90
 ; X64-LIN-NEXT:  .LBB5_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT:    xorb %sil, %dil
-; X64-LIN-NEXT:    movl %edi, %eax
-; X64-LIN-NEXT:    notb %al
-; X64-LIN-NEXT:    andb %sil, %al
-; X64-LIN-NEXT:    addb %al, %al
-; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    xorb %sil, %al
+; X64-LIN-NEXT:    movl %eax, %ecx
+; X64-LIN-NEXT:    notb %cl
+; X64-LIN-NEXT:    andb %sil, %cl
+; X64-LIN-NEXT:    addb %cl, %cl
+; X64-LIN-NEXT:    movl %ecx, %esi
 ; X64-LIN-NEXT:    jne .LBB5_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
-; X64-LIN-NEXT:    movl %edi, %eax
+; X64-LIN-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test6:
 ; X64-WIN:       # %bb.0: # %entry
+; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    .p2align 4, 0x90
 ; X64-WIN-NEXT:  .LBB5_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT:    xorb %dl, %cl
-; X64-WIN-NEXT:    movl %ecx, %eax
-; X64-WIN-NEXT:    notb %al
-; X64-WIN-NEXT:    andb %dl, %al
-; X64-WIN-NEXT:    addb %al, %al
-; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    xorb %dl, %al
+; X64-WIN-NEXT:    movl %eax, %ecx
+; X64-WIN-NEXT:    notb %cl
+; X64-WIN-NEXT:    andb %dl, %cl
+; X64-WIN-NEXT:    addb %cl, %cl
+; X64-WIN-NEXT:    movl %ecx, %edx
 ; X64-WIN-NEXT:    jne .LBB5_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
-; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    retq
 entry:
   br label %bb
@@ -282,34 +284,34 @@
 ;
 ; X64-LIN-LABEL: test7:
 ; X64-LIN:       # %bb.0: # %entry
+; X64-LIN-NEXT:    movl %edi, %eax
 ; X64-LIN-NEXT:    .p2align 4, 0x90
 ; X64-LIN-NEXT:  .LBB6_1: # %bb
 ; X64-LIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-LIN-NEXT:    xorl %esi, %edi
-; X64-LIN-NEXT:    movl %edi, %eax
-; X64-LIN-NEXT:    xorl $2147483646, %eax # imm = 0x7FFFFFFE
-; X64-LIN-NEXT:    andl %esi, %eax
-; X64-LIN-NEXT:    addl %eax, %eax
-; X64-LIN-NEXT:    movl %eax, %esi
+; X64-LIN-NEXT:    xorl %esi, %eax
+; X64-LIN-NEXT:    movl %eax, %ecx
+; X64-LIN-NEXT:    xorl $2147483646, %ecx # imm = 0x7FFFFFFE
+; X64-LIN-NEXT:    andl %esi, %ecx
+; X64-LIN-NEXT:    addl %ecx, %ecx
+; X64-LIN-NEXT:    movl %ecx, %esi
 ; X64-LIN-NEXT:    jne .LBB6_1
 ; X64-LIN-NEXT:  # %bb.2: # %bb12
-; X64-LIN-NEXT:    movl %edi, %eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test7:
 ; X64-WIN:       # %bb.0: # %entry
+; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    .p2align 4, 0x90
 ; X64-WIN-NEXT:  .LBB6_1: # %bb
 ; X64-WIN-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-WIN-NEXT:    xorl %edx, %ecx
-; X64-WIN-NEXT:    movl %ecx, %eax
-; X64-WIN-NEXT:    xorl $2147483646, %eax # imm = 0x7FFFFFFE
-; X64-WIN-NEXT:    andl %edx, %eax
-; X64-WIN-NEXT:    addl %eax, %eax
-; X64-WIN-NEXT:    movl %eax, %edx
+; X64-WIN-NEXT:    xorl %edx, %eax
+; X64-WIN-NEXT:    movl %eax, %ecx
+; X64-WIN-NEXT:    xorl $2147483646, %ecx # imm = 0x7FFFFFFE
+; X64-WIN-NEXT:    andl %edx, %ecx
+; X64-WIN-NEXT:    addl %ecx, %ecx
+; X64-WIN-NEXT:    movl %ecx, %edx
 ; X64-WIN-NEXT:    jne .LBB6_1
 ; X64-WIN-NEXT:  # %bb.2: # %bb12
-; X64-WIN-NEXT:    movl %ecx, %eax
 ; X64-WIN-NEXT:    retq
 entry:
   br label %bb
@@ -336,14 +338,14 @@
 ;
 ; X64-LIN-LABEL: test8:
 ; X64-LIN:       # %bb.0: # %entry
-; X64-LIN-NEXT:    notl %edi
 ; X64-LIN-NEXT:    movl %edi, %eax
+; X64-LIN-NEXT:    notl %eax
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test8:
 ; X64-WIN:       # %bb.0: # %entry
-; X64-WIN-NEXT:    notl %ecx
 ; X64-WIN-NEXT:    movl %ecx, %eax
+; X64-WIN-NEXT:    notl %eax
 ; X64-WIN-NEXT:    retq
 entry:
   %t1 = sub i32 0, %a
@@ -361,16 +363,16 @@
 ;
 ; X64-LIN-LABEL: test9:
 ; X64-LIN:       # %bb.0:
-; X64-LIN-NEXT:    notl %edi
-; X64-LIN-NEXT:    andl $4096, %edi # imm = 0x1000
 ; X64-LIN-NEXT:    movl %edi, %eax
+; X64-LIN-NEXT:    notl %eax
+; X64-LIN-NEXT:    andl $4096, %eax # imm = 0x1000
 ; X64-LIN-NEXT:    retq
 ;
 ; X64-WIN-LABEL: test9:
 ; X64-WIN:       # %bb.0:
-; X64-WIN-NEXT:    notl %ecx
-; X64-WIN-NEXT:    andl $4096, %ecx # imm = 0x1000
 ; X64-WIN-NEXT:    movl %ecx, %eax
+; X64-WIN-NEXT:    notl %eax
+; X64-WIN-NEXT:    andl $4096, %eax # imm = 0x1000
 ; X64-WIN-NEXT:    retq
   %1 = and i32 %a, 4096
   %2 = xor i32 %1, 4096
@@ -459,8 +461,9 @@
 ;
 ; X64-LIN-LABEL: test11:
 ; X64-LIN:       # %bb.0:
-; X64-LIN-NEXT:    movl $-2, %eax
 ; X64-LIN-NEXT:    movl %edi, %ecx
+; X64-LIN-NEXT:    movl $-2, %eax
+; X64-LIN-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-LIN-NEXT:    roll %cl, %eax
 ; X64-LIN-NEXT:    retq
 ;
diff --git a/llvm/test/DebugInfo/COFF/pieces.ll b/llvm/test/DebugInfo/COFF/pieces.ll
index 64f3225..ab3794d 100644
--- a/llvm/test/DebugInfo/COFF/pieces.ll
+++ b/llvm/test/DebugInfo/COFF/pieces.ll
@@ -65,15 +65,15 @@
 
 
 ; ASM-LABEL: pad_right: # @pad_right
-; ASM:         #DEBUG_VALUE: pad_right:o <- [DW_OP_LLVM_fragment 32 32] $ecx
-; ASM:         movl    %ecx, %eax
+; ASM:         movq    %rcx, %rax
+; ASM:         #DEBUG_VALUE: pad_right:o <- [DW_OP_LLVM_fragment 32 32] $eax
 ; ASM:         retq
 
 
 ; ASM-LABEL: pad_left: # @pad_left
-; ASM:         #DEBUG_VALUE: pad_left:o <- [DW_OP_LLVM_fragment 0 32] $ecx
 ; ASM:         .cv_loc 2 1 24 3                # t.c:24:3
-; ASM:         movl    %ecx, %eax
+; ASM:         movq    %rcx, %rax
+; ASM:         #DEBUG_VALUE: pad_left:o <- [DW_OP_LLVM_fragment 0 32] $eax
 ; ASM:         retq
 
 
@@ -136,7 +136,7 @@
 ; ASM:        .asciz  "pad_right"             # Function name
 ; ASM:        .short  4414                    # Record kind: S_LOCAL
 ; ASM:        .asciz  "o"
-; ASM:        .cv_def_range    .Lfunc_begin1 .Lfunc_end1, "C\021\022\000\000\000\004\000\000\000"
+; ASM:        .cv_def_range    .Lfunc_begin1 .Ltmp8, "C\021\021\000\000\000\004\000\000\000"
 
 ; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
 ; OBJ:         Kind: S_GPROC32_ID (0x1147)
@@ -146,7 +146,7 @@
 ; OBJ:         VarName: o
 ; OBJ:       }
 ; OBJ:       DefRangeSubfieldRegisterSym {
-; OBJ:         Register: ECX (0x12)
+; OBJ:         Register: EAX (0x11)
 ; OBJ:         MayHaveNoName: 0
 ; OBJ:         OffsetInParent: 4
 ; OBJ:         LocalVariableAddrRange {
@@ -159,7 +159,7 @@
 ; ASM:        .asciz  "pad_left"              # Function name
 ; ASM:        .short  4414                    # Record kind: S_LOCAL
 ; ASM:        .asciz  "o"
-; ASM:        .cv_def_range    .Lfunc_begin2 .Lfunc_end2, "C\021\022\000\000\000\000\000\000\000"
+; ASM:        .cv_def_range    .Lfunc_begin2 .Ltmp10, "C\021\021\000\000\000\000\000\000\000"
 
 ; OBJ-LABEL: {{.*}}Proc{{.*}}Sym {
 ; OBJ:         Kind: S_GPROC32_ID (0x1147)
@@ -169,7 +169,7 @@
 ; OBJ:         VarName: o
 ; OBJ:       }
 ; OBJ:       DefRangeSubfieldRegisterSym {
-; OBJ:         Register: ECX (0x12)
+; OBJ:         Register: EAX (0x11)
 ; OBJ:         MayHaveNoName: 0
 ; OBJ:         OffsetInParent: 0
 ; OBJ:         LocalVariableAddrRange {
diff --git a/llvm/test/DebugInfo/X86/live-debug-values.ll b/llvm/test/DebugInfo/X86/live-debug-values.ll
index 9bf2d3e..4884733 100644
--- a/llvm/test/DebugInfo/X86/live-debug-values.ll
+++ b/llvm/test/DebugInfo/X86/live-debug-values.ll
@@ -33,7 +33,7 @@
 ; CHECK-NEXT:  #DEBUG_VALUE: main:n <- $ebx
 ;   Other register values have been clobbered.
 ; CHECK-NOT:   #DEBUG_VALUE:
-; CHECK:         movl    %ecx, m(%rip)
+; CHECK:         movl    %esi, m(%rip)
 
 ; ModuleID = 'LiveDebugValues.c'
 source_filename = "test/DebugInfo/X86/live-debug-values.ll"
diff --git a/llvm/test/DebugInfo/X86/live-debug-variables.ll b/llvm/test/DebugInfo/X86/live-debug-variables.ll
index 5f510e8..e746a0d 100644
--- a/llvm/test/DebugInfo/X86/live-debug-variables.ll
+++ b/llvm/test/DebugInfo/X86/live-debug-variables.ll
@@ -25,7 +25,7 @@
 ; CHECK:      .debug_loc contents:
 ; CHECK-NEXT: 0x00000000:
 ;   We currently emit an entry for the function prologue, too, which could be optimized away.
-; CHECK:              [0x000000000000001f, 0x000000000000003c): DW_OP_reg3 RBX
+; CHECK:              [0x0000000000000018, 0x0000000000000072): DW_OP_reg3 RBX
 ;   We should only have one entry inside the function.
 ; CHECK-NOT: :
 
diff --git a/llvm/test/DebugInfo/X86/pieces-3.ll b/llvm/test/DebugInfo/X86/pieces-3.ll
index 8afdcfa..e67e51e 100644
--- a/llvm/test/DebugInfo/X86/pieces-3.ll
+++ b/llvm/test/DebugInfo/X86/pieces-3.ll
@@ -17,11 +17,12 @@
 ;
 ; CHECK: DW_TAG_formal_parameter [3]
 ; CHECK-NEXT:   DW_AT_location [DW_FORM_data4]        (
-; CHECK-NEXT:     [0x0000000000000000, 0x0000000000000004): DW_OP_reg5 RDI, DW_OP_piece 0x8, DW_OP_piece 0x4, DW_OP_reg4 RSI, DW_OP_piece 0x4
-; CHECK-NEXT:     [0x0000000000000004, 0x0000000000000008): DW_OP_reg5 RDI, DW_OP_piece 0x8, DW_OP_piece 0x4, DW_OP_reg4 RSI, DW_OP_piece 0x4)
+; CHECK-NEXT:     [0x0000000000000000, 0x0000000000000007): DW_OP_reg5 RDI, DW_OP_piece 0x8, DW_OP_piece 0x4, DW_OP_reg4 RSI, DW_OP_piece 0x4
+; CHECK-NEXT:     [0x0000000000000007, 0x0000000000000007): DW_OP_reg5 RDI, DW_OP_piece 0x8, DW_OP_piece 0x4, DW_OP_reg0 RAX, DW_OP_piece 0x4)
 ; CHECK-NEXT:   DW_AT_name {{.*}}"outer"
 ; CHECK: DW_TAG_variable
-; CHECK-NEXT:   DW_AT_location {{.*}}(DW_OP_reg4 RSI, DW_OP_piece 0x4)
+; CHECK-NEXT:   DW_AT_location [DW_FORM_data4]        (0x00000044
+; CHECK-NEXT:     [0x0000000000000007, 0x0000000000000007): DW_OP_reg0 RAX, DW_OP_piece 0x4)
 ; CHECK-NEXT:   "i1"
 
 ; ModuleID = '/Volumes/Data/llvm/test/DebugInfo/X86/sroasplit-2.ll'