blob: bcbcd1e0f7d57fe4b3972adc24785dc6837386f6 [file] [log] [blame]
/* Copyright 2002 Andi Kleen */
Dave Jones038b0a62006-10-04 03:38:54 -04002
Jan Beulich8d379da2006-09-26 10:52:32 +02003#include <linux/linkage.h>
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +01004
Jan Beulich8d379da2006-09-26 10:52:32 +02005#include <asm/cpufeature.h>
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +01006#include <asm/dwarf2.h>
Jan Beulich8d379da2006-09-26 10:52:32 +02007
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */
Linus Torvalds1da177e2005-04-16 15:20:36 -070019
/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework.
 *
 * In:       rdi = destination, rsi = source, rdx = byte count
 * Out:      rax = original destination
 * Clobbers: rcx, rdx, rsi, rdi, flags
 *
 * The byte count is a 64-bit value, so the qword count has to be
 * computed in the full %rcx; a 32-bit computation would drop bits
 * 32..63 and truncate copies of 4GB and more.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax			/* return value: original destination */

	movq %rdx, %rcx
	shrq $3, %rcx			/* rcx = number of whole 8-byte words */
	andl $7, %edx			/* edx = leftover bytes (0..7) */
	rep movsq
	movl %edx, %ecx			/* 32-bit write also clears the top of rcx */
	rep movsb
	ret
.Lmemcpy_e:
	.previous
Jan Beulich8d379da2006-09-26 10:52:32 +020039
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Unrolled copy loop.
	 *
	 * In:       rdi = destination, rsi = source, rdx = byte count
	 * Out:      rax = original destination
	 * Clobbers: rcx, rsi, rdi, r8, r9, r10, r11, flags
	 *
	 * Put the number of full 64-byte blocks into %rcx.  The byte
	 * count is a 64-bit value, so the block counter must be 64 bits
	 * wide too: a 32-bit counter would drop bits 32..63 and truncate
	 * copies of 4GB and more.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $6, %rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions in between do
	 * not change the zero flag):
	 */
	decq %rcx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	/* leaq (not addq) advances the pointers without touching flags */
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	/*
	 * Fewer than 64 bytes remain.  Only the low six bits of the
	 * count matter from here on, so 32-bit arithmetic is sufficient.
	 * %ecx = number of whole 8-byte words left (0..7):
	 */
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	/* ZF from decl survives the movq/leaq below until the jnz */
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz .Lloop_8

.Lhandle_7:
	/* %ecx = trailing bytes left (0..7) */
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
Andi Kleen7bcd3f32006-02-03 21:51:02 +0100124
/*
 * The string copy instructions are faster on some CPUs, and the
 * code using them is much simpler.  Prefer that variant whenever
 * the CPU advertises X86_FEATURE_REP_GOOD.
 *
 * The record below names memcpy, the .Lmemcpy_c replacement code
 * and the feature bit, followed by two one-byte lengths.
 */

	.section .altinstructions, "a"
	.balign 8
	.quad memcpy
	.quad .Lmemcpy_c
	.word X86_FEATURE_REP_GOOD

	/*
	 * Both length bytes carry the size of the replacement, so only
	 * the beginning of memcpy gets replaced.  memcpy itself is used
	 * to apply alternatives; overwriting the rest of it with nops
	 * while it runs could only end in a reboot.
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous