blob: 1f5c5161ead682664dc30fc5dda802de2de0bc4b [file] [log] [blame]
Vegard Nossumf8561292008-04-04 00:53:23 +02001#ifdef CONFIG_KMEMCHECK
2/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
3# include <asm-generic/xor.h>
Jan Beuliche8f6e3f2012-11-02 14:19:18 +00004#elif !defined(_ASM_X86_XOR_H)
5#define _ASM_X86_XOR_H
6
7/*
8 * Optimized RAID-5 checksumming functions for SSE.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2, or (at your option)
13 * any later version.
14 *
15 * You should have received a copy of the GNU General Public License
16 * (for example /usr/src/linux/COPYING); if not, write to the Free
17 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20/*
21 * Cache avoiding checksumming functions utilizing KNI instructions
22 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
23 */
24
25/*
26 * Based on
27 * High-speed RAID5 checksumming functions utilizing SSE instructions.
28 * Copyright (C) 1998 Ingo Molnar.
29 */
30
31/*
32 * x86-64 changes / gcc fixes from Andi Kleen.
33 * Copyright 2002 Andi Kleen, SuSE Labs.
34 *
35 * This hasn't been optimized for the hammer yet, but there are likely
36 * no advantages to be gotten from x86-64 here anyways.
37 */
38
Ingo Molnardf6b35f2015-04-24 02:46:00 +020039#include <asm/fpu/api.h>
Jan Beuliche8f6e3f2012-11-02 14:19:18 +000040
41#ifdef CONFIG_X86_32
42/* reduce register pressure */
43# define XOR_CONSTANT_CONSTRAINT "i"
Vegard Nossumf8561292008-04-04 00:53:23 +020044#else
Jan Beuliche8f6e3f2012-11-02 14:19:18 +000045# define XOR_CONSTANT_CONSTRAINT "re"
46#endif
47
48#define OFFS(x) "16*("#x")"
49#define PF_OFFS(x) "256+16*("#x")"
50#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
51#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
52#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
53#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
54#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
55#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
56#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
57#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
58#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
59#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
60#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
Jan Beulichf3178202012-11-02 14:20:24 +000061#define NOP(x)
62
63#define BLK64(pf, op, i) \
64 pf(i) \
65 op(i, 0) \
66 op(i + 1, 1) \
67 op(i + 2, 2) \
68 op(i + 3, 3)
Jan Beuliche8f6e3f2012-11-02 14:19:18 +000069
70static void
71xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
72{
73 unsigned long lines = bytes >> 8;
74
75 kernel_fpu_begin();
76
77 asm volatile(
78#undef BLOCK
79#define BLOCK(i) \
80 LD(i, 0) \
81 LD(i + 1, 1) \
82 PF1(i) \
83 PF1(i + 2) \
84 LD(i + 2, 2) \
85 LD(i + 3, 3) \
86 PF0(i + 4) \
87 PF0(i + 6) \
88 XO1(i, 0) \
89 XO1(i + 1, 1) \
90 XO1(i + 2, 2) \
91 XO1(i + 3, 3) \
92 ST(i, 0) \
93 ST(i + 1, 1) \
94 ST(i + 2, 2) \
95 ST(i + 3, 3) \
96
97
98 PF0(0)
99 PF0(2)
100
101 " .align 32 ;\n"
102 " 1: ;\n"
103
104 BLOCK(0)
105 BLOCK(4)
106 BLOCK(8)
107 BLOCK(12)
108
109 " add %[inc], %[p1] ;\n"
110 " add %[inc], %[p2] ;\n"
111 " dec %[cnt] ;\n"
112 " jnz 1b ;\n"
113 : [cnt] "+r" (lines),
114 [p1] "+r" (p1), [p2] "+r" (p2)
115 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
116 : "memory");
117
118 kernel_fpu_end();
119}
120
121static void
Jan Beulichf3178202012-11-02 14:20:24 +0000122xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
123{
124 unsigned long lines = bytes >> 8;
125
126 kernel_fpu_begin();
127
128 asm volatile(
129#undef BLOCK
130#define BLOCK(i) \
131 BLK64(PF0, LD, i) \
132 BLK64(PF1, XO1, i) \
133 BLK64(NOP, ST, i) \
134
135 " .align 32 ;\n"
136 " 1: ;\n"
137
138 BLOCK(0)
139 BLOCK(4)
140 BLOCK(8)
141 BLOCK(12)
142
143 " add %[inc], %[p1] ;\n"
144 " add %[inc], %[p2] ;\n"
145 " dec %[cnt] ;\n"
146 " jnz 1b ;\n"
147 : [cnt] "+r" (lines),
148 [p1] "+r" (p1), [p2] "+r" (p2)
149 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
150 : "memory");
151
152 kernel_fpu_end();
153}
154
155static void
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000156xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
157 unsigned long *p3)
158{
159 unsigned long lines = bytes >> 8;
160
161 kernel_fpu_begin();
162
163 asm volatile(
164#undef BLOCK
165#define BLOCK(i) \
166 PF1(i) \
167 PF1(i + 2) \
168 LD(i, 0) \
169 LD(i + 1, 1) \
170 LD(i + 2, 2) \
171 LD(i + 3, 3) \
172 PF2(i) \
173 PF2(i + 2) \
174 PF0(i + 4) \
175 PF0(i + 6) \
176 XO1(i, 0) \
177 XO1(i + 1, 1) \
178 XO1(i + 2, 2) \
179 XO1(i + 3, 3) \
180 XO2(i, 0) \
181 XO2(i + 1, 1) \
182 XO2(i + 2, 2) \
183 XO2(i + 3, 3) \
184 ST(i, 0) \
185 ST(i + 1, 1) \
186 ST(i + 2, 2) \
187 ST(i + 3, 3) \
188
189
190 PF0(0)
191 PF0(2)
192
193 " .align 32 ;\n"
194 " 1: ;\n"
195
196 BLOCK(0)
197 BLOCK(4)
198 BLOCK(8)
199 BLOCK(12)
200
201 " add %[inc], %[p1] ;\n"
202 " add %[inc], %[p2] ;\n"
203 " add %[inc], %[p3] ;\n"
204 " dec %[cnt] ;\n"
205 " jnz 1b ;\n"
206 : [cnt] "+r" (lines),
207 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
208 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
209 : "memory");
210
211 kernel_fpu_end();
212}
213
214static void
Jan Beulichf3178202012-11-02 14:20:24 +0000215xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
216 unsigned long *p3)
217{
218 unsigned long lines = bytes >> 8;
219
220 kernel_fpu_begin();
221
222 asm volatile(
223#undef BLOCK
224#define BLOCK(i) \
225 BLK64(PF0, LD, i) \
226 BLK64(PF1, XO1, i) \
227 BLK64(PF2, XO2, i) \
228 BLK64(NOP, ST, i) \
229
230 " .align 32 ;\n"
231 " 1: ;\n"
232
233 BLOCK(0)
234 BLOCK(4)
235 BLOCK(8)
236 BLOCK(12)
237
238 " add %[inc], %[p1] ;\n"
239 " add %[inc], %[p2] ;\n"
240 " add %[inc], %[p3] ;\n"
241 " dec %[cnt] ;\n"
242 " jnz 1b ;\n"
243 : [cnt] "+r" (lines),
244 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
245 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
246 : "memory");
247
248 kernel_fpu_end();
249}
250
251static void
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000252xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
253 unsigned long *p3, unsigned long *p4)
254{
255 unsigned long lines = bytes >> 8;
256
257 kernel_fpu_begin();
258
259 asm volatile(
260#undef BLOCK
261#define BLOCK(i) \
262 PF1(i) \
263 PF1(i + 2) \
264 LD(i, 0) \
265 LD(i + 1, 1) \
266 LD(i + 2, 2) \
267 LD(i + 3, 3) \
268 PF2(i) \
269 PF2(i + 2) \
270 XO1(i, 0) \
271 XO1(i + 1, 1) \
272 XO1(i + 2, 2) \
273 XO1(i + 3, 3) \
274 PF3(i) \
275 PF3(i + 2) \
276 PF0(i + 4) \
277 PF0(i + 6) \
278 XO2(i, 0) \
279 XO2(i + 1, 1) \
280 XO2(i + 2, 2) \
281 XO2(i + 3, 3) \
282 XO3(i, 0) \
283 XO3(i + 1, 1) \
284 XO3(i + 2, 2) \
285 XO3(i + 3, 3) \
286 ST(i, 0) \
287 ST(i + 1, 1) \
288 ST(i + 2, 2) \
289 ST(i + 3, 3) \
290
291
292 PF0(0)
293 PF0(2)
294
295 " .align 32 ;\n"
296 " 1: ;\n"
297
298 BLOCK(0)
299 BLOCK(4)
300 BLOCK(8)
301 BLOCK(12)
302
303 " add %[inc], %[p1] ;\n"
304 " add %[inc], %[p2] ;\n"
305 " add %[inc], %[p3] ;\n"
306 " add %[inc], %[p4] ;\n"
307 " dec %[cnt] ;\n"
308 " jnz 1b ;\n"
309 : [cnt] "+r" (lines), [p1] "+r" (p1),
310 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
311 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
312 : "memory");
313
314 kernel_fpu_end();
315}
316
317static void
Jan Beulichf3178202012-11-02 14:20:24 +0000318xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
319 unsigned long *p3, unsigned long *p4)
320{
321 unsigned long lines = bytes >> 8;
322
323 kernel_fpu_begin();
324
325 asm volatile(
326#undef BLOCK
327#define BLOCK(i) \
328 BLK64(PF0, LD, i) \
329 BLK64(PF1, XO1, i) \
330 BLK64(PF2, XO2, i) \
331 BLK64(PF3, XO3, i) \
332 BLK64(NOP, ST, i) \
333
334 " .align 32 ;\n"
335 " 1: ;\n"
336
337 BLOCK(0)
338 BLOCK(4)
339 BLOCK(8)
340 BLOCK(12)
341
342 " add %[inc], %[p1] ;\n"
343 " add %[inc], %[p2] ;\n"
344 " add %[inc], %[p3] ;\n"
345 " add %[inc], %[p4] ;\n"
346 " dec %[cnt] ;\n"
347 " jnz 1b ;\n"
348 : [cnt] "+r" (lines), [p1] "+r" (p1),
349 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
350 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
351 : "memory");
352
353 kernel_fpu_end();
354}
355
356static void
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000357xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
358 unsigned long *p3, unsigned long *p4, unsigned long *p5)
359{
360 unsigned long lines = bytes >> 8;
361
362 kernel_fpu_begin();
363
364 asm volatile(
365#undef BLOCK
366#define BLOCK(i) \
367 PF1(i) \
368 PF1(i + 2) \
369 LD(i, 0) \
370 LD(i + 1, 1) \
371 LD(i + 2, 2) \
372 LD(i + 3, 3) \
373 PF2(i) \
374 PF2(i + 2) \
375 XO1(i, 0) \
376 XO1(i + 1, 1) \
377 XO1(i + 2, 2) \
378 XO1(i + 3, 3) \
379 PF3(i) \
380 PF3(i + 2) \
381 XO2(i, 0) \
382 XO2(i + 1, 1) \
383 XO2(i + 2, 2) \
384 XO2(i + 3, 3) \
385 PF4(i) \
386 PF4(i + 2) \
387 PF0(i + 4) \
388 PF0(i + 6) \
389 XO3(i, 0) \
390 XO3(i + 1, 1) \
391 XO3(i + 2, 2) \
392 XO3(i + 3, 3) \
393 XO4(i, 0) \
394 XO4(i + 1, 1) \
395 XO4(i + 2, 2) \
396 XO4(i + 3, 3) \
397 ST(i, 0) \
398 ST(i + 1, 1) \
399 ST(i + 2, 2) \
400 ST(i + 3, 3) \
401
402
403 PF0(0)
404 PF0(2)
405
406 " .align 32 ;\n"
407 " 1: ;\n"
408
409 BLOCK(0)
410 BLOCK(4)
411 BLOCK(8)
412 BLOCK(12)
413
414 " add %[inc], %[p1] ;\n"
415 " add %[inc], %[p2] ;\n"
416 " add %[inc], %[p3] ;\n"
417 " add %[inc], %[p4] ;\n"
418 " add %[inc], %[p5] ;\n"
419 " dec %[cnt] ;\n"
420 " jnz 1b ;\n"
421 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
422 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
423 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
424 : "memory");
425
426 kernel_fpu_end();
427}
428
Jan Beulichf3178202012-11-02 14:20:24 +0000429static void
430xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
431 unsigned long *p3, unsigned long *p4, unsigned long *p5)
432{
433 unsigned long lines = bytes >> 8;
434
435 kernel_fpu_begin();
436
437 asm volatile(
438#undef BLOCK
439#define BLOCK(i) \
440 BLK64(PF0, LD, i) \
441 BLK64(PF1, XO1, i) \
442 BLK64(PF2, XO2, i) \
443 BLK64(PF3, XO3, i) \
444 BLK64(PF4, XO4, i) \
445 BLK64(NOP, ST, i) \
446
447 " .align 32 ;\n"
448 " 1: ;\n"
449
450 BLOCK(0)
451 BLOCK(4)
452 BLOCK(8)
453 BLOCK(12)
454
455 " add %[inc], %[p1] ;\n"
456 " add %[inc], %[p2] ;\n"
457 " add %[inc], %[p3] ;\n"
458 " add %[inc], %[p4] ;\n"
459 " add %[inc], %[p5] ;\n"
460 " dec %[cnt] ;\n"
461 " jnz 1b ;\n"
462 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
463 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
464 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
465 : "memory");
466
467 kernel_fpu_end();
468}
469
470static struct xor_block_template xor_block_sse_pf64 = {
471 .name = "prefetch64-sse",
472 .do_2 = xor_sse_2_pf64,
473 .do_3 = xor_sse_3_pf64,
474 .do_4 = xor_sse_4_pf64,
475 .do_5 = xor_sse_5_pf64,
476};
477
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000478#undef LD
479#undef XO1
480#undef XO2
481#undef XO3
482#undef XO4
483#undef ST
Jan Beulichf3178202012-11-02 14:20:24 +0000484#undef NOP
485#undef BLK64
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000486#undef BLOCK
487
488#undef XOR_CONSTANT_CONSTRAINT
489
Thomas Gleixner96a388d2007-10-11 11:20:03 +0200490#ifdef CONFIG_X86_32
David Howellsa1ce3922012-10-02 18:01:25 +0100491# include <asm/xor_32.h>
Thomas Gleixner96a388d2007-10-11 11:20:03 +0200492#else
David Howellsa1ce3922012-10-02 18:01:25 +0100493# include <asm/xor_64.h>
Thomas Gleixner96a388d2007-10-11 11:20:03 +0200494#endif
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000495
Jan Beulichf3178202012-11-02 14:20:24 +0000496#define XOR_SELECT_TEMPLATE(FASTEST) \
497 AVX_SELECT(FASTEST)
498
Jan Beuliche8f6e3f2012-11-02 14:19:18 +0000499#endif /* _ASM_X86_XOR_H */