Merge "Fix SIB for base + index addressing in x86_64 assembler."
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 8fc5e34..302e835 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -252,6 +252,7 @@
arch/x86_64/context_x86_64.cc \
arch/x86_64/entrypoints_init_x86_64.cc \
arch/x86_64/jni_entrypoints_x86_64.S \
+ arch/x86_64/memcmp16_x86_64.S \
arch/x86_64/portable_entrypoints_x86_64.S \
arch/x86_64/quick_entrypoints_x86_64.S \
arch/x86_64/thread_x86_64.cc \
diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h
index 65d2f92..14dc1e3 100644
--- a/runtime/arch/memcmp16.h
+++ b/runtime/arch/memcmp16.h
@@ -30,7 +30,7 @@
//
// In both cases, MemCmp16 is declared.
-#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) || defined(__x86_64__)
extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count);
#define MemCmp16 __memcmp16
diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S
index 17662fa..a315a37 100644
--- a/runtime/arch/x86/memcmp16_x86.S
+++ b/runtime/arch/x86/memcmp16_x86.S
@@ -21,1018 +21,1018 @@
/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */
#ifndef L
-# define L(label) .L##label
+# define L(label) .L##label
#endif
-#define CFI_PUSH(REG) \
- CFI_ADJUST_CFA_OFFSET(4); \
- CFI_REL_OFFSET(REG, 0)
+#define CFI_PUSH(REG) \
+ CFI_ADJUST_CFA_OFFSET(4); \
+ CFI_REL_OFFSET(REG, 0)
-#define CFI_POP(REG) \
- CFI_ADJUST_CFA_OFFSET(-4); \
- CFI_RESTORE(REG)
+#define CFI_POP(REG) \
+ CFI_ADJUST_CFA_OFFSET(-4); \
+ CFI_RESTORE(REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
+#define PARMS 4
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
DEFINE_FUNCTION MEMCMP
- movl LEN(%esp), %ecx
+ movl LEN(%esp), %ecx
- shl $1, %ecx
- jz L(zero)
+ shl $1, %ecx
+ jz L(zero)
- movl BLK1(%esp), %eax
- cmp $48, %ecx
- movl BLK2(%esp), %edx
- jae L(48bytesormore)
+ movl BLK1(%esp), %eax
+ cmp $48, %ecx
+ movl BLK2(%esp), %edx
+ jae L(48bytesormore)
- PUSH (%ebx)
- add %ecx, %edx
- add %ecx, %eax
- jmp L(less48bytes)
+ PUSH (%ebx)
+ add %ecx, %edx
+ add %ecx, %eax
+ jmp L(less48bytes)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
- .p2align 4
+ .p2align 4
L(zero):
- xor %eax, %eax
- ret
+ xor %eax, %eax
+ ret
- .p2align 4
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
- CFI_REMEMBER_STATE
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
- movl %eax, %edi
- movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%edi), %edi
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
+ CFI_REMEMBER_STATE
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
+ movl %eax, %edi
+ movl %edx, %esi
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%edi), %edi
- sub $0xffff, %edx
- lea 16(%esi), %esi
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %edx, %edi
- sub %edx, %esi
- add %edx, %ecx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %edx, %esi
+ sub $0xffff, %edx
+ lea 16(%esi), %esi
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %edx, %edi
+ sub %edx, %esi
+ add %edx, %ecx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %edx, %esi
- cmp $0, %edx
- je L(shr_0)
- cmp $2, %edx
- je L(shr_2)
- cmp $4, %edx
- je L(shr_4)
- cmp $6, %edx
- je L(shr_6)
- cmp $8, %edx
- je L(shr_8)
- cmp $10, %edx
- je L(shr_10)
- cmp $12, %edx
- je L(shr_12)
- jmp L(shr_14)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $6, %edx
+ je L(shr_6)
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $12, %edx
+ je L(shr_12)
+ jmp L(shr_14)
- .p2align 4
+ .p2align 4
L(shr_0):
- cmp $80, %ecx
- jae L(shr_0_gobble)
- lea -48(%ecx), %ecx
- xor %eax, %eax
- movaps (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
- movaps 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- add $32, %edi
- add $32, %esi
- sub $0xffff, %edx
- jnz L(exit)
+ cmp $80, %ecx
+ jae L(shr_0_gobble)
+ lea -48(%ecx), %ecx
+ xor %eax, %eax
+ movaps (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+ movaps 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ add $32, %edi
+ add $32, %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_0_gobble):
- lea -48(%ecx), %ecx
- movdqa (%esi), %xmm0
- xor %eax, %eax
- pcmpeqb (%edi), %xmm0
- sub $32, %ecx
- movdqa 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
+ lea -48(%ecx), %ecx
+ movdqa (%esi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %ecx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%esi), %xmm0
- movdqa 48(%esi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%edi), %xmm0
- pcmpeqb 48(%edi), %xmm2
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- jz L(shr_0_gobble_loop)
+ pand %xmm0, %xmm2
+ sub $32, %ecx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%esi), %xmm0
+ movdqa 48(%esi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%edi), %xmm0
+ pcmpeqb 48(%edi), %xmm2
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ jz L(shr_0_gobble_loop)
- pand %xmm0, %xmm2
- cmp $0, %ecx
- jge L(shr_0_gobble_loop_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm2
+ cmp $0, %ecx
+ jge L(shr_0_gobble_loop_next)
+ inc %edx
+ add $32, %ecx
L(shr_0_gobble_loop_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_2_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $2,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $2,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $2,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $2,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $2,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $2,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $2,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $2,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $2,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $2,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_4_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $4,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $4,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $4,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $4,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $4,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $4,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $4,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $4,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $4,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $4,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_6_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $6,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $6,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $6,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $6,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $6,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $6,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $6,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $6,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $6,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $6,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_8_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8,(%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $8,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $8,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $8,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $8,(%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $8,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $8,16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $8,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $8,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $8,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $8,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_10_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $10,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $10,%xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $10, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $10, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $10, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $10, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $10,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $10,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $10,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $10,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_12_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $12, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $12, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $12, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $12, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $12,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $12,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $12,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $12,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_14_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $14, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $14, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $14, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $14, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $14,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $14,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $14,48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $14,32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(exit):
- pmovmskb %xmm1, %ebx
- sub $0xffff, %ebx
- jz L(first16bytes)
- lea -16(%esi), %esi
- lea -16(%edi), %edi
- mov %ebx, %edx
+ pmovmskb %xmm1, %ebx
+ sub $0xffff, %ebx
+ jz L(first16bytes)
+ lea -16(%esi), %esi
+ lea -16(%edi), %edi
+ mov %ebx, %edx
L(first16bytes):
- add %eax, %esi
+ add %eax, %esi
L(less16bytes):
- test %dl, %dl
- jz L(next_four_words)
- test $15, %dl
- jz L(second_two_words)
- test $3, %dl
- jz L(second_word)
- movzwl -16(%edi), %eax
- movzwl -16(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test %dl, %dl
+ jz L(next_four_words)
+ test $15, %dl
+ jz L(second_two_words)
+ test $3, %dl
+ jz L(second_word)
+ movzwl -16(%edi), %eax
+ movzwl -16(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(second_word):
- movzwl -14(%edi), %eax
- movzwl -14(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -14(%edi), %eax
+ movzwl -14(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(second_two_words):
- test $63, %dl
- jz L(fourth_word)
- movzwl -12(%edi), %eax
- movzwl -12(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test $63, %dl
+ jz L(fourth_word)
+ movzwl -12(%edi), %eax
+ movzwl -12(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(fourth_word):
- movzwl -10(%edi), %eax
- movzwl -10(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -10(%edi), %eax
+ movzwl -10(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(next_four_words):
- test $15, %dh
- jz L(fourth_two_words)
- test $3, %dh
- jz L(sixth_word)
- movzwl -8(%edi), %eax
- movzwl -8(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test $15, %dh
+ jz L(fourth_two_words)
+ test $3, %dh
+ jz L(sixth_word)
+ movzwl -8(%edi), %eax
+ movzwl -8(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(sixth_word):
- movzwl -6(%edi), %eax
- movzwl -6(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -6(%edi), %eax
+ movzwl -6(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(fourth_two_words):
- test $63, %dh
- jz L(eighth_word)
- movzwl -4(%edi), %eax
- movzwl -4(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test $63, %dh
+ jz L(eighth_word)
+ movzwl -4(%edi), %eax
+ movzwl -4(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(eighth_word):
- movzwl -2(%edi), %eax
- movzwl -2(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -2(%edi), %eax
+ movzwl -2(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- CFI_PUSH (%ebx)
+ CFI_PUSH (%ebx)
- .p2align 4
+ .p2align 4
L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $12, %ecx
- je L(12bytes)
- jmp L(14bytes)
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ jmp L(14bytes)
- .p2align 4
+ .p2align 4
L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $20, %ecx
- je L(20bytes)
- jmp L(22bytes)
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ jmp L(22bytes)
- .p2align 4
+ .p2align 4
L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $28, %ecx
- je L(28bytes)
- jmp L(30bytes)
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ jmp L(30bytes)
- .p2align 4
+ .p2align 4
L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $36, %ecx
- je L(36bytes)
- jmp L(38bytes)
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ jmp L(38bytes)
- .p2align 4
+ .p2align 4
L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $4, %ecx
- je L(4bytes)
- jmp L(6bytes)
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ jmp L(6bytes)
- .p2align 4
+ .p2align 4
L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $44, %ecx
- je L(44bytes)
- jmp L(46bytes)
+ cmp $40, %ecx
+ je L(40bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ jmp L(46bytes)
- .p2align 4
+ .p2align 4
L(46bytes):
- movzwl -46(%eax), %ecx
- movzwl -46(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -46(%eax), %ecx
+ movzwl -46(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(44bytes):
- movzwl -44(%eax), %ecx
- movzwl -44(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -44(%eax), %ecx
+ movzwl -44(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(42bytes):
- movzwl -42(%eax), %ecx
- movzwl -42(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -42(%eax), %ecx
+ movzwl -42(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(40bytes):
- movzwl -40(%eax), %ecx
- movzwl -40(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -40(%eax), %ecx
+ movzwl -40(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(38bytes):
- movzwl -38(%eax), %ecx
- movzwl -38(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -38(%eax), %ecx
+ movzwl -38(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(36bytes):
- movzwl -36(%eax), %ecx
- movzwl -36(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -36(%eax), %ecx
+ movzwl -36(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(34bytes):
- movzwl -34(%eax), %ecx
- movzwl -34(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -34(%eax), %ecx
+ movzwl -34(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(32bytes):
- movzwl -32(%eax), %ecx
- movzwl -32(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -32(%eax), %ecx
+ movzwl -32(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(30bytes):
- movzwl -30(%eax), %ecx
- movzwl -30(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -30(%eax), %ecx
+ movzwl -30(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(28bytes):
- movzwl -28(%eax), %ecx
- movzwl -28(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -28(%eax), %ecx
+ movzwl -28(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(26bytes):
- movzwl -26(%eax), %ecx
- movzwl -26(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -26(%eax), %ecx
+ movzwl -26(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(24bytes):
- movzwl -24(%eax), %ecx
- movzwl -24(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -24(%eax), %ecx
+ movzwl -24(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(22bytes):
- movzwl -22(%eax), %ecx
- movzwl -22(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -22(%eax), %ecx
+ movzwl -22(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(20bytes):
- movzwl -20(%eax), %ecx
- movzwl -20(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -20(%eax), %ecx
+ movzwl -20(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(18bytes):
- movzwl -18(%eax), %ecx
- movzwl -18(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -18(%eax), %ecx
+ movzwl -18(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(16bytes):
- movzwl -16(%eax), %ecx
- movzwl -16(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -16(%eax), %ecx
+ movzwl -16(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(14bytes):
- movzwl -14(%eax), %ecx
- movzwl -14(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -14(%eax), %ecx
+ movzwl -14(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(12bytes):
- movzwl -12(%eax), %ecx
- movzwl -12(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -12(%eax), %ecx
+ movzwl -12(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(10bytes):
- movzwl -10(%eax), %ecx
- movzwl -10(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -10(%eax), %ecx
+ movzwl -10(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(8bytes):
- movzwl -8(%eax), %ecx
- movzwl -8(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -8(%eax), %ecx
+ movzwl -8(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(6bytes):
- movzwl -6(%eax), %ecx
- movzwl -6(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -6(%eax), %ecx
+ movzwl -6(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(4bytes):
- movzwl -4(%eax), %ecx
- movzwl -4(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -4(%eax), %ecx
+ movzwl -4(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(2bytes):
- movzwl -2(%eax), %eax
- movzwl -2(%edx), %ebx
- subl %ebx, %eax
- POP (%ebx)
- ret
- CFI_PUSH (%ebx)
+ movzwl -2(%eax), %eax
+ movzwl -2(%edx), %ebx
+ subl %ebx, %eax
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
- .p2align 4
+ .p2align 4
L(memcmp16_exit):
- POP (%ebx)
- mov %ecx, %eax
- ret
+ POP (%ebx)
+ mov %ecx, %eax
+ ret
END_FUNCTION MEMCMP
diff --git a/runtime/arch/x86_64/memcmp16_x86_64.S b/runtime/arch/x86_64/memcmp16_x86_64.S
new file mode 100755
index 0000000..46e4ba3
--- /dev/null
+++ b/runtime/arch/x86_64/memcmp16_x86_64.S
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "asm_support_x86_64.S"
+
+#define MEMCMP __memcmp16
+
+/*
+ * Half of Silvermont L1 Data Cache size
+ *(see original file cache.h in bionic/libc/arch-x86_64/).
+ * This value is used for specific optimization on big lengths.
+ */
+#define DATA_CACHE_SIZE_HALF (12*1024)
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) (I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ add %r11, %rcx; \
+ jmp *%rcx; \
+ ud2
+
+DEFINE_FUNCTION MEMCMP
+ pxor %xmm0, %xmm0
+ shl $1, %rdx
+ cmp $79, %rdx
+ ja L(79bytesormore)
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(79bytesormore):
+ movdqu (%rsi), %xmm1
+ movdqu (%rdi), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+ mov %rsi, %rcx
+ and $-16, %rsi
+ add $16, %rsi
+ sub %rsi, %rcx
+
+ sub %rcx, %rdi
+ add %rcx, %rdx
+ test $0xf, %rdi
+ jz L(2aligned)
+
+ cmp $128, %rdx
+ ja L(128bytesormore)
+L(less128bytes):
+ sub $64, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(128bytesormore):
+ cmp $512, %rdx
+ ja L(512bytesormore)
+ cmp $256, %rdx
+ ja L(less512bytes)
+L(less256bytes):
+ sub $128, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(less512bytes):
+ sub $256, %rdx
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqu 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqu 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqu 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqu 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqu 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqu 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqu 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqu 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytes)
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_unaglined)
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loop):
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+L(L2_L3_cache_unaglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_unaligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+/*
+ * This case is for machines which are sensitive for unaligned instructions.
+ */
+ ALIGN (4)
+L(2aligned):
+ cmp $128, %rdx
+ ja L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+ sub $64, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64in2alinged)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64in2alinged):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(128bytesormorein2aligned):
+ cmp $512, %rdx
+ ja L(512bytesormorein2aligned)
+ cmp $256, %rdx
+ ja L(256bytesormorein2aligned)
+L(less256bytesin2alinged):
+ sub $128, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128in2aligned)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(256bytesormorein2aligned):
+
+ sub $256, %rdx
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqa 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqa 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqa 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqa 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqa 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqa 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqa 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqa 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytesin2alinged)
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256in2alinged)
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256in2alinged):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+ ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+ ja L(L2_L3_cache_aglined)
+
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loopin2aligned):
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loopin2aligned)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+L(L2_L3_cache_aglined):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_aligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2)
+
+
+ ALIGN (4)
+L(64bytesormore_loop_end):
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm2, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm3, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm4, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ jmp L(16bytes)
+
+L(256bytesin256):
+ add $256, %rdi
+ add $256, %rsi
+ jmp L(16bytes)
+L(240bytesin256):
+ add $240, %rdi
+ add $240, %rsi
+ jmp L(16bytes)
+L(224bytesin256):
+ add $224, %rdi
+ add $224, %rsi
+ jmp L(16bytes)
+L(208bytesin256):
+ add $208, %rdi
+ add $208, %rsi
+ jmp L(16bytes)
+L(192bytesin256):
+ add $192, %rdi
+ add $192, %rsi
+ jmp L(16bytes)
+L(176bytesin256):
+ add $176, %rdi
+ add $176, %rsi
+ jmp L(16bytes)
+L(160bytesin256):
+ add $160, %rdi
+ add $160, %rsi
+ jmp L(16bytes)
+L(144bytesin256):
+ add $144, %rdi
+ add $144, %rsi
+ jmp L(16bytes)
+L(128bytesin256):
+ add $128, %rdi
+ add $128, %rsi
+ jmp L(16bytes)
+L(112bytesin256):
+ add $112, %rdi
+ add $112, %rsi
+ jmp L(16bytes)
+L(96bytesin256):
+ add $96, %rdi
+ add $96, %rsi
+ jmp L(16bytes)
+L(80bytesin256):
+ add $80, %rdi
+ add $80, %rsi
+ jmp L(16bytes)
+L(64bytesin256):
+ add $64, %rdi
+ add $64, %rsi
+ jmp L(16bytes)
+L(48bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(32bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytes):
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(8bytes):
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(12bytes):
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(4bytes):
+ mov -4(%rsi), %ecx
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(0bytes):
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(66bytes):
+ movdqu -66(%rdi), %xmm1
+ movdqu -66(%rsi), %xmm2
+ mov $-66, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(50bytes):
+ movdqu -50(%rdi), %xmm1
+ movdqu -50(%rsi), %xmm2
+ mov $-50, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ movdqu -34(%rdi), %xmm1
+ movdqu -34(%rsi), %xmm2
+ mov $-34, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%rdi), %rax
+ mov -18(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(10bytes):
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(14bytes):
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(6bytes):
+ mov -6(%rdi), %eax
+ mov -6(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(2bytes):
+ movzwl -2(%rsi), %ecx
+ movzwl -2(%rdi), %eax
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(68bytes):
+ movdqu -68(%rdi), %xmm2
+ movdqu -68(%rsi), %xmm1
+ mov $-68, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(52bytes):
+ movdqu -52(%rdi), %xmm2
+ movdqu -52(%rsi), %xmm1
+ mov $-52, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%rdi), %xmm2
+ movdqu -36(%rsi), %xmm1
+ mov $-36, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%rdi), %xmm2
+ movdqu -20(%rsi), %xmm1
+ mov $-20, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(70bytes):
+ movdqu -70(%rsi), %xmm1
+ movdqu -70(%rdi), %xmm2
+ mov $-70, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(54bytes):
+ movdqu -54(%rsi), %xmm1
+ movdqu -54(%rdi), %xmm2
+ mov $-54, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ movdqu -38(%rsi), %xmm1
+ movdqu -38(%rdi), %xmm2
+ mov $-38, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ movdqu -22(%rsi), %xmm1
+ movdqu -22(%rdi), %xmm2
+ mov $-22, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(72bytes):
+ movdqu -72(%rsi), %xmm1
+ movdqu -72(%rdi), %xmm2
+ mov $-72, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(56bytes):
+ movdqu -56(%rdi), %xmm2
+ movdqu -56(%rsi), %xmm1
+ mov $-56, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ movdqu -40(%rdi), %xmm2
+ movdqu -40(%rsi), %xmm1
+ mov $-40, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ movdqu -24(%rdi), %xmm2
+ movdqu -24(%rsi), %xmm1
+ mov $-24, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(74bytes):
+ movdqu -74(%rsi), %xmm1
+ movdqu -74(%rdi), %xmm2
+ mov $-74, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(58bytes):
+ movdqu -58(%rdi), %xmm2
+ movdqu -58(%rsi), %xmm1
+ mov $-58, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ movdqu -42(%rdi), %xmm2
+ movdqu -42(%rsi), %xmm1
+ mov $-42, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ movdqu -26(%rdi), %xmm2
+ movdqu -26(%rsi), %xmm1
+ mov $-26, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ jmp L(end)
+
+ ALIGN (4)
+L(76bytes):
+ movdqu -76(%rsi), %xmm1
+ movdqu -76(%rdi), %xmm2
+ mov $-76, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(60bytes):
+ movdqu -60(%rdi), %xmm2
+ movdqu -60(%rsi), %xmm1
+ mov $-60, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ movdqu -44(%rdi), %xmm2
+ movdqu -44(%rsi), %xmm1
+ mov $-44, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ movdqu -28(%rdi), %xmm2
+ movdqu -28(%rsi), %xmm1
+ mov $-28, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(78bytes):
+ movdqu -78(%rsi), %xmm1
+ movdqu -78(%rdi), %xmm2
+ mov $-78, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(62bytes):
+ movdqu -62(%rdi), %xmm2
+ movdqu -62(%rsi), %xmm1
+ mov $-62, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ movdqu -46(%rdi), %xmm2
+ movdqu -46(%rsi), %xmm1
+ mov $-46, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ movdqu -30(%rdi), %xmm2
+ movdqu -30(%rsi), %xmm1
+ mov $-30, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(64bytes):
+ movdqu -64(%rdi), %xmm2
+ movdqu -64(%rsi), %xmm1
+ mov $-64, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%rdi), %xmm2
+ movdqu -48(%rsi), %xmm1
+ mov $-48, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%rdi), %xmm2
+ movdqu -32(%rsi), %xmm1
+ mov $-32, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+/*
+ * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+ */
+ ALIGN (3)
+L(less16bytes):
+ movsbq %dl, %rdx
+ mov (%rsi, %rdx), %rcx
+ mov (%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov 8(%rsi, %rdx), %rcx
+ mov 8(%rdi, %rdx), %rax
+L(diffin8bytes):
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ shr $32, %rcx
+ shr $32, %rax
+L(diffin4bytes):
+ cmp %cx, %ax
+ jne L(end)
+ shr $16, %ecx
+ shr $16, %eax
+ jmp L(end)
+
+ ALIGN (4)
+L(end):
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+END_FUNCTION MEMCMP
+
+ ALIGN (3)
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(66bytes), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(70bytes), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(74bytes), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(78bytes), L(table_64bytes))
diff --git a/runtime/gc/accounting/card_table-inl.h b/runtime/gc/accounting/card_table-inl.h
index 46b9363..217360f 100644
--- a/runtime/gc/accounting/card_table-inl.h
+++ b/runtime/gc/accounting/card_table-inl.h
@@ -37,12 +37,13 @@
// Align the address down.
address -= shift_in_bytes;
const size_t shift_in_bits = shift_in_bytes * kBitsPerByte;
- AtomicInteger* word_atomic = reinterpret_cast<AtomicInteger*>(address);
+ Atomic<uintptr_t>* word_atomic = reinterpret_cast<Atomic<uintptr_t>*>(address);
// Word with the byte we are trying to cas cleared.
- const int32_t cur_word = word_atomic->LoadRelaxed() & ~(0xFF << shift_in_bits);
- const int32_t old_word = cur_word | (static_cast<int32_t>(old_value) << shift_in_bits);
- const int32_t new_word = cur_word | (static_cast<int32_t>(new_value) << shift_in_bits);
+ const uintptr_t cur_word = word_atomic->LoadRelaxed() &
+ ~(static_cast<uintptr_t>(0xFF) << shift_in_bits);
+ const uintptr_t old_word = cur_word | (static_cast<uintptr_t>(old_value) << shift_in_bits);
+ const uintptr_t new_word = cur_word | (static_cast<uintptr_t>(new_value) << shift_in_bits);
return word_atomic->CompareExchangeWeakRelaxed(old_word, new_word);
#endif
}