/* Copyright 2002 Andi Kleen */

#include <asm/cpufeature.h>
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */
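/*
 * The generic copy below runs in three phases: a loop moving 64 bytes
 * per iteration, a loop moving the remaining full quadwords, and a
 * final loop for the last 0..7 bytes.  The count is handled as a
 * 32-bit value throughout.
 */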
	.globl __memcpy
	.globl memcpy
	.p2align 4
__memcpy:
memcpy:
	pushq %rbx
	movq %rdi,%rax
	movl %edx,%ecx
	shrl $6,%ecx
	jz .Lhandle_tail
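	/*
	 * Copy 64 bytes per iteration.  Loads and stores are interleaved
	 * in register pairs so the CPU can overlap them; leaq advances
	 * the pointers without disturbing the flags set by decl.
	 */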
	.p2align 4
.Lloop_64:
	decl %ecx
	movq (%rsi),%r11
	movq 8(%rsi),%r8
	movq %r11,(%rdi)
	movq %r8,1*8(%rdi)
	movq 2*8(%rsi),%r9
	movq 3*8(%rsi),%r10
	movq %r9,2*8(%rdi)
	movq %r10,3*8(%rdi)
	movq 4*8(%rsi),%r11
	movq 5*8(%rsi),%r8
	movq %r11,4*8(%rdi)
	movq %r8,5*8(%rdi)
	movq 6*8(%rsi),%r9
	movq 7*8(%rsi),%r10
	movq %r9,6*8(%rdi)
	movq %r10,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	jnz .Lloop_64
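	/* Copy the remaining full quadwords: ((count % 64) / 8) of them. */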
.Lhandle_tail:
	movl %edx,%ecx
	andl $63,%ecx
	shrl $3,%ecx
	jz .Lhandle_7
	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),%r8
	movq %r8,(%rdi)
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz .Lloop_8
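	/* Copy the last count % 8 bytes one at a time. */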
.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
	movb (%rsi),%r8b
	movb %r8b,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1
.Lende:
	popq %rbx
	ret
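	/*
	 * .Lfinal marks the end of memcpy for the length field in the
	 * alternatives entry below.
	 */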
.Lfinal:
	/*
	 * Some CPUs run faster using the string copy instructions,
	 * which are also a lot simpler.  Use them when possible.
	 */
	.section .altinstructions,"a"
	.align 8
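	/*
	 * Entry layout: address of the original code, address of the
	 * replacement, CPU feature bit that selects the replacement,
	 * length of the original, length of the replacement.
	 */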
	.quad memcpy
	.quad memcpy_c
	.byte X86_FEATURE_REP_GOOD
	.byte .Lfinal-memcpy
	.byte memcpy_c_end-memcpy_c
	.previous
	.section .altinstr_replacement,"ax"
	/*
	 * rdi	destination
	 * rsi	source
	 * rdx	count
	 */
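	/*
	 * Copy count/8 quadwords with rep movsq, then the remaining
	 * count%8 bytes with rep movsb.
	 */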
memcpy_c:
	movq %rdi,%rax
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
	rep
	movsq
	movl %edx,%ecx
	rep
	movsb
	ret
memcpy_c_end:
	.previous