#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/asm.h>
#include <asm/i387.h>

/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We could use MMX just for prefetch in IRQs. This may be a win
 *		(reported as such on the K6-III).
 *	We should use a better code-neutral filler for the short jump:
 *		leal ebx,[ebx] is apparently best for the K6-2, but what about Cyrix?
 *	We also want to clobber the filler register so we don't get any
 *		register-forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far, with any MMX approach tried.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons.
 *
 */

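/*
 * _mmx_memcpy - copy @len bytes from @from to @to through the MMX
 * registers, 64 bytes per inner-loop iteration.  In interrupt context
 * the FPU/MMX state must not be touched, so we fall back to the plain
 * __memcpy() there.
 */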
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;	/* len/64 */

	kernel_fpu_begin();

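	/*
	 * Prime the prefetch for the first 320 bytes of the source.  If a
	 * prefetch faults, the fixup at label 3 patches the word at label 1
	 * with 0xEB 0x1A - a two-byte short jmp that skips the remaining
	 * 26 bytes of the 28-byte prefetch block - and resumes at label 2,
	 * so the patched code simply jumps over the prefetches from then on.
	 */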
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b)
		: : "r" (from));

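	/*
	 * Main loop: copy 64 bytes per iteration through %mm0-%mm3 while
	 * prefetching 320 bytes (five 64-byte blocks) ahead.  The last five
	 * blocks are left to the second loop below, which does not prefetch,
	 * so we never prefetch past the end of the source.
	 */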
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
			"1: prefetch 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			".section .fixup, \"ax\"\n"
			"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
			"   jmp 2b\n"
			".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();
	return p;
}

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache-bypass load/store. The Cyrix III, K6 and
 *	other MMX-using processors do not.
 */

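/*
 * Zero a 4096-byte page with non-temporal (cache-bypassing) movntq
 * stores, 64 bytes per iteration from a zeroed %mm0.
 */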
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"   movntq %%mm0, (%0)\n"
			"   movntq %%mm0, 8(%0)\n"
			"   movntq %%mm0, 16(%0)\n"
			"   movntq %%mm0, 24(%0)\n"
			"   movntq %%mm0, 32(%0)\n"
			"   movntq %%mm0, 40(%0)\n"
			"   movntq %%mm0, 48(%0)\n"
			"   movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}
	/*
	 * movntq stores are weakly ordered, so an sfence is needed to make
	 * them globally ordered again.
	 */
	__asm__ __volatile__ (
		"   sfence\n" : :
	);
	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * Maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b)
		: : "r" (from));

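	/*
	 * Copy the page 64 bytes at a time: load through the cache with
	 * movq, store past it with movntq, and keep the prefetch running
	 * 320 bytes ahead.  The final 320 bytes are handled by the second
	 * loop below without prefetch, so the prefetches never reach past
	 * the end of the source page.
	 */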
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
			"1: prefetch 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			".section .fixup, \"ax\"\n"
			"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
			"   jmp 2b\n"
			".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
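	/*
	 * Last five 64-byte blocks: no prefetch here, the loop above has
	 * already prefetched the rest of the page.
	 */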
	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"2: movq (%0), %%mm0\n"
			"   movntq %%mm0, (%1)\n"
			"   movq 8(%0), %%mm1\n"
			"   movntq %%mm1, 8(%1)\n"
			"   movq 16(%0), %%mm2\n"
			"   movntq %%mm2, 16(%1)\n"
			"   movq 24(%0), %%mm3\n"
			"   movntq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm4\n"
			"   movntq %%mm4, 32(%1)\n"
			"   movq 40(%0), %%mm5\n"
			"   movntq %%mm5, 40(%1)\n"
			"   movq 48(%0), %%mm6\n"
			"   movntq %%mm6, 48(%1)\n"
			"   movq 56(%0), %%mm7\n"
			"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * movntq stores are weakly ordered, so an sfence is needed to make
	 * them globally ordered again.
	 */
	__asm__ __volatile__ (
		"   sfence\n" : :
	);
	kernel_fpu_end();
}

#else

/*
 *	Generic MMX implementation without K7-specific streaming stores.
 */

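/*
 * Zero a 4096-byte page through the cache with ordinary movq stores,
 * 128 bytes per iteration; no sfence is needed since nothing here is
 * weakly ordered.
 */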
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			"   movq %%mm0, (%0)\n"
			"   movq %%mm0, 8(%0)\n"
			"   movq %%mm0, 16(%0)\n"
			"   movq %%mm0, 24(%0)\n"
			"   movq %%mm0, 32(%0)\n"
			"   movq %%mm0, 40(%0)\n"
			"   movq %%mm0, 48(%0)\n"
			"   movq %%mm0, 56(%0)\n"
			"   movq %%mm0, 64(%0)\n"
			"   movq %%mm0, 72(%0)\n"
			"   movq %%mm0, 80(%0)\n"
			"   movq %%mm0, 88(%0)\n"
			"   movq %%mm0, 96(%0)\n"
			"   movq %%mm0, 104(%0)\n"
			"   movq %%mm0, 112(%0)\n"
			"   movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

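/*
 * Copy a 4096-byte page through the cache with plain movq loads and
 * stores, prefetching 320 bytes ahead; unlike the K7 path there are no
 * non-temporal stores, so no sfence is required afterwards.
 */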
static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b)
		: : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"1: prefetch 320(%0)\n"
			"2: movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			".section .fixup, \"ax\"\n"
			"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
			"   jmp 2b\n"
			".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif

/*
 *	Favour MMX for page clear and copy.
 */

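/*
 * Fallback paths for interrupt context, where the FPU/MMX state must not
 * be touched: clear or copy the 4096-byte page as 1024 32-bit words with
 * rep stosl / rep movsl.
 */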
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);