#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyway.
 */

/*
 * One 16-byte-aligned save slot for a single XMM register.  The XOR
 * routines in this file keep an array of four of these on the stack and
 * XMMS_SAVE/XMMS_RESTORE address it in 0x10-byte steps.
 */
typedef struct {
	unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;

/*
 * XMMS_SAVE - open a section that is allowed to use %xmm0-%xmm3.
 *
 * The macro takes no arguments; it expects two variables to be visible at
 * the point of expansion:
 *   unsigned long cr0;        receives the current value of CR0
 *   xmm_store_t xmm_save[4];  receives the contents of %xmm0-%xmm3
 *
 * Sequence: disable preemption, copy CR0 into cr0, execute clts, then store
 * %xmm0-%xmm3 to xmm_save at offsets 0x00/0x10/0x20/0x30.  Preemption stays
 * disabled afterwards, so every XMMS_SAVE must be followed by an
 * XMMS_RESTORE in the same scope.
 *
 * Doesn't use gcc to save the XMM registers, because there is no easy way to
 * tell it to do a clts before the register saving.
 */
#define XMMS_SAVE \
do { \
	preempt_disable(); \
	asm volatile( \
		"movq %%cr0,%0 ;\n\t" \
		"clts ;\n\t" \
		"movups %%xmm0,(%1) ;\n\t" \
		"movups %%xmm1,0x10(%1) ;\n\t" \
		"movups %%xmm2,0x20(%1) ;\n\t" \
		"movups %%xmm3,0x30(%1) ;\n\t" \
		: "=&r" (cr0) \
		: "r" (xmm_save) \
		: "memory"); \
} while (0)

/*
 * XMMS_RESTORE - close a section opened with XMMS_SAVE.
 *
 * Uses the same two variables from the expanding scope, "cr0" and
 * "xmm_save", that XMMS_SAVE filled in.
 *
 * Sequence: sfence (store fence for the stores issued so far), reload
 * %xmm0-%xmm3 from xmm_save at offsets 0x00/0x10/0x20/0x30, write the saved
 * value back into CR0, and finally re-enable preemption.
 */
#define XMMS_RESTORE \
do { \
	asm volatile( \
		"sfence ;\n\t" \
		"movups (%1),%%xmm0 ;\n\t" \
		"movups 0x10(%1),%%xmm1 ;\n\t" \
		"movups 0x20(%1),%%xmm2 ;\n\t" \
		"movups 0x30(%1),%%xmm3 ;\n\t" \
		"movq %0,%%cr0 ;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory"); \
	preempt_enable(); \
} while (0)

/*
 * String-building helpers for the inline asm loops in this file.  Each one
 * expands to a piece of assembler text; "x" is an index in 16-byte units
 * and "y" is the number of the XMM register to use.
 *
 *   OFFS(x)     byte offset of 16-byte unit x
 *   PF_OFFS(x)  the same offset, 256 bytes further on
 *   LD(x, y)    load unit x of [p1] into %xmm<y>
 *   ST(x, y)    store %xmm<y> to unit x of [p1]
 *   PFn(x)      prefetchnta at PF_OFFS(x) from the n-th source
 *               (PF0 -> [p1], PF1 -> [p2], ... PF5 -> [p6])
 *   XOn(x, y)   xor unit x of the n-th source into %xmm<y>
 *               (XO1 -> [p2], XO2 -> [p3], ... XO5 -> [p6])
 *
 * The %[pN] names are asm operand names that the using function must
 * define.  None of the functions visible in this file defines [p6], so
 * PF5 and XO5 are not used here.
 */
#define OFFS(x) "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"

#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"

#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"

#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"


90static void
91xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92{
Joe Perches687c8052008-03-23 01:04:03 -070093 unsigned int lines = bytes >> 8;
Linus Torvalds1da177e2005-04-16 15:20:36 -070094 unsigned long cr0;
95 xmm_store_t xmm_save[4];
96
97 XMMS_SAVE;
98
Joe Perches687c8052008-03-23 01:04:03 -070099 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100#undef BLOCK
101#define BLOCK(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700102 LD(i, 0) \
103 LD(i + 1, 1) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700104 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700105 PF1(i + 2) \
106 LD(i + 2, 2) \
107 LD(i + 3, 3) \
108 PF0(i + 4) \
109 PF0(i + 6) \
110 XO1(i, 0) \
111 XO1(i + 1, 1) \
112 XO1(i + 2, 2) \
113 XO1(i + 3, 3) \
114 ST(i, 0) \
115 ST(i + 1, 1) \
116 ST(i + 2, 2) \
117 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118
119
120 PF0(0)
121 PF0(2)
122
123 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700124 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700125
126 BLOCK(0)
127 BLOCK(4)
128 BLOCK(8)
129 BLOCK(12)
130
Joe Perches687c8052008-03-23 01:04:03 -0700131 " addq %[inc], %[p1] ;\n"
132 " addq %[inc], %[p2] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133 " decl %[cnt] ; jnz 1b"
134 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
Joe Perches687c8052008-03-23 01:04:03 -0700135 : [inc] "r" (256UL)
136 : "memory");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137
138 XMMS_RESTORE;
139}
140
141static void
142xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143 unsigned long *p3)
144{
145 unsigned int lines = bytes >> 8;
146 xmm_store_t xmm_save[4];
147 unsigned long cr0;
148
149 XMMS_SAVE;
150
Joe Perches687c8052008-03-23 01:04:03 -0700151 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152#undef BLOCK
153#define BLOCK(i) \
154 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700155 PF1(i + 2) \
156 LD(i, 0) \
157 LD(i + 1, 1) \
158 LD(i + 2, 2) \
159 LD(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160 PF2(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700161 PF2(i + 2) \
162 PF0(i + 4) \
163 PF0(i + 6) \
164 XO1(i, 0) \
165 XO1(i + 1, 1) \
166 XO1(i + 2, 2) \
167 XO1(i + 3, 3) \
168 XO2(i, 0) \
169 XO2(i + 1, 1) \
170 XO2(i + 2, 2) \
171 XO2(i + 3, 3) \
172 ST(i, 0) \
173 ST(i + 1, 1) \
174 ST(i + 2, 2) \
175 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700176
177
178 PF0(0)
179 PF0(2)
180
181 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700182 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183
184 BLOCK(0)
185 BLOCK(4)
186 BLOCK(8)
187 BLOCK(12)
188
Joe Perches687c8052008-03-23 01:04:03 -0700189 " addq %[inc], %[p1] ;\n"
190 " addq %[inc], %[p2] ;\n"
191 " addq %[inc], %[p3] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700192 " decl %[cnt] ; jnz 1b"
193 : [cnt] "+r" (lines),
194 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195 : [inc] "r" (256UL)
Joe Perches687c8052008-03-23 01:04:03 -0700196 : "memory");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197 XMMS_RESTORE;
198}
199
200static void
201xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202 unsigned long *p3, unsigned long *p4)
203{
204 unsigned int lines = bytes >> 8;
Joe Perches687c8052008-03-23 01:04:03 -0700205 xmm_store_t xmm_save[4];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 unsigned long cr0;
207
208 XMMS_SAVE;
209
Joe Perches687c8052008-03-23 01:04:03 -0700210 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700211#undef BLOCK
212#define BLOCK(i) \
213 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700214 PF1(i + 2) \
215 LD(i, 0) \
216 LD(i + 1, 1) \
217 LD(i + 2, 2) \
218 LD(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700219 PF2(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700220 PF2(i + 2) \
221 XO1(i, 0) \
222 XO1(i + 1, 1) \
223 XO1(i + 2, 2) \
224 XO1(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700225 PF3(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700226 PF3(i + 2) \
227 PF0(i + 4) \
228 PF0(i + 6) \
229 XO2(i, 0) \
230 XO2(i + 1, 1) \
231 XO2(i + 2, 2) \
232 XO2(i + 3, 3) \
233 XO3(i, 0) \
234 XO3(i + 1, 1) \
235 XO3(i + 2, 2) \
236 XO3(i + 3, 3) \
237 ST(i, 0) \
238 ST(i + 1, 1) \
239 ST(i + 2, 2) \
240 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241
242
243 PF0(0)
244 PF0(2)
245
246 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700247 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248
249 BLOCK(0)
250 BLOCK(4)
251 BLOCK(8)
252 BLOCK(12)
253
Joe Perches687c8052008-03-23 01:04:03 -0700254 " addq %[inc], %[p1] ;\n"
255 " addq %[inc], %[p2] ;\n"
256 " addq %[inc], %[p3] ;\n"
257 " addq %[inc], %[p4] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258 " decl %[cnt] ; jnz 1b"
259 : [cnt] "+c" (lines),
260 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261 : [inc] "r" (256UL)
Joe Perches687c8052008-03-23 01:04:03 -0700262 : "memory" );
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263
264 XMMS_RESTORE;
265}
266
267static void
268xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269 unsigned long *p3, unsigned long *p4, unsigned long *p5)
270{
Joe Perches687c8052008-03-23 01:04:03 -0700271 unsigned int lines = bytes >> 8;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272 xmm_store_t xmm_save[4];
273 unsigned long cr0;
274
275 XMMS_SAVE;
276
Joe Perches687c8052008-03-23 01:04:03 -0700277 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278#undef BLOCK
279#define BLOCK(i) \
280 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700281 PF1(i + 2) \
282 LD(i, 0) \
283 LD(i + 1, 1) \
284 LD(i + 2, 2) \
285 LD(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700286 PF2(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700287 PF2(i + 2) \
288 XO1(i, 0) \
289 XO1(i + 1, 1) \
290 XO1(i + 2, 2) \
291 XO1(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700292 PF3(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700293 PF3(i + 2) \
294 XO2(i, 0) \
295 XO2(i + 1, 1) \
296 XO2(i + 2, 2) \
297 XO2(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298 PF4(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700299 PF4(i + 2) \
300 PF0(i + 4) \
301 PF0(i + 6) \
302 XO3(i, 0) \
303 XO3(i + 1, 1) \
304 XO3(i + 2, 2) \
305 XO3(i + 3, 3) \
306 XO4(i, 0) \
307 XO4(i + 1, 1) \
308 XO4(i + 2, 2) \
309 XO4(i + 3, 3) \
310 ST(i, 0) \
311 ST(i + 1, 1) \
312 ST(i + 2, 2) \
313 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314
315
316 PF0(0)
317 PF0(2)
318
319 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700320 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321
322 BLOCK(0)
323 BLOCK(4)
324 BLOCK(8)
325 BLOCK(12)
326
Joe Perches687c8052008-03-23 01:04:03 -0700327 " addq %[inc], %[p1] ;\n"
328 " addq %[inc], %[p2] ;\n"
329 " addq %[inc], %[p3] ;\n"
330 " addq %[inc], %[p4] ;\n"
331 " addq %[inc], %[p5] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332 " decl %[cnt] ; jnz 1b"
333 : [cnt] "+c" (lines),
Joe Perches687c8052008-03-23 01:04:03 -0700334 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335 [p5] "+r" (p5)
336 : [inc] "r" (256UL)
337 : "memory");
338
339 XMMS_RESTORE;
340}
341
342static struct xor_block_template xor_block_sse = {
Joe Perches687c8052008-03-23 01:04:03 -0700343 .name = "generic_sse",
344 .do_2 = xor_sse_2,
345 .do_3 = xor_sse_3,
346 .do_4 = xor_sse_4,
347 .do_5 = xor_sse_5,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348};
349
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition: the only template this
 * header passes to xor_speed() is the SSE one.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
	xor_speed(&xor_block_sse); \
} while (0)

/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 *
 * The FASTEST argument is therefore ignored; the macro always evaluates to
 * the address of xor_block_sse.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */