/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
Jan Beulich7269e882009-12-18 16:16:03 +000026 .section .altinstr_replacement, "ax", @progbits
27.Lmemcpy_c:
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +010028 movq %rdi, %rax
29
30 movl %edx, %ecx
31 shrl $3, %ecx
32 andl $7, %edx
Jan Beulich8d379da2006-09-26 10:52:32 +020033 rep movsq
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +010034 movl %edx, %ecx
Jan Beulich8d379da2006-09-26 10:52:32 +020035 rep movsb
36 ret
Jan Beulich7269e882009-12-18 16:16:03 +000037.Lmemcpy_e:
38 .previous
Jan Beulich8d379da2006-09-26 10:52:32 +020039
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax		/* return value: original destination */

	/*
	 * Use 32bit CMP here to avoid long NOP padding.
	 */
	cmp $0x20, %edx		/* counts < 32 bytes go straight to the tail */
	jb .Lhandle_tail

	/*
	 * We check whether memory false dependence could occur,
	 * then jump to corresponding copy mode.
	 *
	 * NOTE(review): this compares only the low bytes of src/dst and
	 * uses a signed jl — an intentional cheap heuristic (matches
	 * upstream); do not "fix" to a full-width unsigned compare.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subl $0x20, %edx	/* bias count so CF from the loop's subq drives jae */
.Lcopy_forward_loop:
	subq $0x20, %rdx	/* 32 bytes per iteration */

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop	/* loop while the biased count has not gone negative */
	addq $0x20, %rdx	/* undo the bias: rdx = remaining 0..31 bytes */
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi		/* point rsi/rdi one past the end */
	addq %rdx, %rdi
	subq $0x20, %rdx	/* same CF-bias trick as the forward loop */
	/*
	 * At most 3 ALU operations in one cycle,
	 * so append NOPS in the same 16 byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addq $0x20, %rdx	/* undo bias: rdx = remaining 0..31 bytes */
	subq %rdx, %rsi		/* rewind to the start of the remaining head */
	subq %rdx, %rdi
.Lhandle_tail:
	cmpq $16, %rdx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes: two possibly-overlapping
	 * 16-byte moves from each end of the remaining range.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpq $8, %rdx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes: overlapping qword moves.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpq $4, %rdx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes: overlapping dword moves.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
	/*
	 * Move data from 1 bytes to 3 bytes, one byte per iteration.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
Andi Kleen7bcd3f32006-02-03 21:51:02 +0100172
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +0100173 /*
174 * Some CPUs run faster using the string copy instructions.
175 * It is also a lot simpler. Use this when possible:
176 */
Andi Kleen7bcd3f32006-02-03 21:51:02 +0100177
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +0100178 .section .altinstructions, "a"
Andi Kleen7bcd3f32006-02-03 21:51:02 +0100179 .align 8
Jan Beulich8d379da2006-09-26 10:52:32 +0200180 .quad memcpy
Jan Beulich7269e882009-12-18 16:16:03 +0000181 .quad .Lmemcpy_c
H. Peter Anvin83a7a2a2010-06-10 00:10:43 +0000182 .word X86_FEATURE_REP_GOOD
Ingo Molnarf3b6eaf2009-03-12 12:20:17 +0100183
184 /*
185 * Replace only beginning, memcpy is used to apply alternatives,
186 * so it is silly to overwrite itself with nops - reboot is the
187 * only outcome...
188 */
Jan Beulich7269e882009-12-18 16:16:03 +0000189 .byte .Lmemcpy_e - .Lmemcpy_c
190 .byte .Lmemcpy_e - .Lmemcpy_c
Andi Kleen7bcd3f32006-02-03 21:51:02 +0100191 .previous