/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	 We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	 We should use a better, code-neutral filler for the short jump:
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	 We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

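/*
 * _mmx_memcpy - memcpy() through the MMX register file.
 *
 * Not usable from interrupt context: kernel_fpu_begin() cannot be used
 * there, so in_interrupt() callers are routed to plain __memcpy().
 * Otherwise the buffer is moved in 64-byte chunks via %mm0-%mm3, with
 * 3DNow! prefetch hints running ahead of the loads, and the
 * sub-64-byte tail is finished with __memcpy().
 */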
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;	/* len/64 */

	kernel_fpu_begin();

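	/*
	 * Warm the cache: prefetch the first 320 bytes of the source.
	 * The 3DNow! "prefetch" instruction faults on CPUs that lack it,
	 * so the exception fixup at 3: patches the first prefetch with
	 * the word 0x1AEB ("jmp +26" in memory order EB 1A), hopping
	 * over the remaining 26 bytes of the 28-byte prefetch block and
	 * self-modifying the warm-up into a no-op after the first fault.
	 */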
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	/*
	 * Main loop: the prefetch runs 320 bytes (five chunks) ahead, so
	 * stop five chunks early and let the prefetch-free loop below
	 * finish, rather than prefetching past the end of the buffer.
	 */
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
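
/*
 * Example (hypothetical caller, not part of this file): draining a
 * page-sized bounce buffer from process context, where in_interrupt()
 * is false and the MMX fast path is taken:
 *
 *	static char bounce[4096];
 *
 *	void drain_bounce(void *dst)
 *	{
 *		_mmx_memcpy(dst, bounce, sizeof(bounce));
 *	}
 *
 * From interrupt context the same call degrades to __memcpy(), so
 * callers need not check in_interrupt() themselves.
 */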

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX-using processors do not.
 */

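/*
 * movntq is a non-temporal (cache-bypassing) store: the destination
 * page goes straight to memory instead of evicting useful cache lines.
 * The price is weak ordering, hence the sfence before the page may be
 * used elsewhere.
 */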
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	/* Zero %mm0; it is the source operand for every store below. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

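	/*
	 * Two passes: the first (4096-320)/64 chunks keep a prefetch
	 * running 320 bytes ahead; the final 320 bytes are copied by the
	 * prefetch-free loop below, so we never prefetch beyond the end
	 * of the source page.
	 */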
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7-specific streaming:
 */
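/*
 * These variants use ordinary cached movq stores instead of movntq,
 * so no sfence is required and the page ends up hot in the cache;
 * the clear loop below also widens each iteration to 128 bytes.
 */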
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	/* Zero %mm0; it is the source operand for every store below. */
	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy, falling back to rep-string
 * versions when MMX cannot be used (i.e. from interrupt context):
 */
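/*
 * The slow_*() helpers use plain rep-string instructions: 1024 4-byte
 * stosl/movsl iterations cover one 4096-byte page without touching
 * FPU/MMX state, which makes them safe in interrupt context.
 */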
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);