blob: 24957e39ac8aff0f0e7192d31c4b01abd1d3415d [file] [log] [blame]
/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

/*
 * Save slot for one XMM register: two unsigned longs, forced to 16-byte
 * alignment (and therefore padded to a multiple of 16 bytes).
 * XMMS_SAVE / XMMS_RESTORE address an array of these at offsets
 * 0x00, 0x10, 0x20 and 0x30, i.e. one slot per saved register.
 */
typedef struct {
	unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;

/*
 * Save %cr0 and %xmm0-%xmm3 so the XOR loops may use those registers.
 *
 * Doesn't use gcc to save the XMM registers, because there is no easy way
 * to tell it to do a clts before the register saving.
 *
 * The macro refers to two names that must exist in the enclosing scope:
 *   cr0      - lvalue that receives the previous %cr0 value
 *   xmm_save - base address of the save area; 0x40 bytes are written
 *
 * Preemption is disabled here and is only re-enabled by XMMS_RESTORE, so
 * every XMMS_SAVE must be paired with exactly one XMMS_RESTORE.
 */
#define XMMS_SAVE \
do { \
	preempt_disable(); \
	asm volatile( \
		"movq %%cr0,%0 ;\n\t" \
		"clts ;\n\t" \
		"movups %%xmm0,(%1) ;\n\t" \
		"movups %%xmm1,0x10(%1) ;\n\t" \
		"movups %%xmm2,0x20(%1) ;\n\t" \
		"movups %%xmm3,0x30(%1) ;\n\t" \
		: "=&r" (cr0) \
		: "r" (xmm_save) \
		: "memory"); \
} while (0)

/*
 * Undo XMMS_SAVE: issue an sfence, reload %xmm0-%xmm3 from xmm_save,
 * write the saved value back to %cr0 and re-enable preemption.
 *
 * Uses the same two names from the enclosing scope as XMMS_SAVE
 * (cr0 and xmm_save); both must still hold what XMMS_SAVE stored.
 */
#define XMMS_RESTORE \
do { \
	asm volatile( \
		"sfence ;\n\t" \
		"movups (%1),%%xmm0 ;\n\t" \
		"movups 0x10(%1),%%xmm1 ;\n\t" \
		"movups 0x20(%1),%%xmm2 ;\n\t" \
		"movups 0x30(%1),%%xmm3 ;\n\t" \
		"movq %0,%%cr0 ;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory"); \
	preempt_enable(); \
} while (0)

/*
 * String-building helpers for the inline-asm XOR loops below.  Each one
 * expands to a piece of assembler text; x is a 16-byte slot index inside
 * the current 256-byte line and y is the XMM register number.
 *
 * The named asm operands %[p1] .. %[p6] must be supplied by the asm
 * statement that uses these macros.
 */

/* Byte offset of slot x, and the same offset one line (256 bytes) ahead. */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"

/* Aligned load from / store to the p1 buffer. */
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"

/* Non-temporal prefetch one line ahead; PFn reads through p(n+1). */
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define PF5(x)		" prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"

/* XOR slot x of buffer p(n+1) into XMM register y. */
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define XO5(x, y)	" xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"


87static void
88xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
89{
Joe Perches687c8052008-03-23 01:04:03 -070090 unsigned int lines = bytes >> 8;
Linus Torvalds1da177e2005-04-16 15:20:36 -070091 unsigned long cr0;
92 xmm_store_t xmm_save[4];
93
94 XMMS_SAVE;
95
Joe Perches687c8052008-03-23 01:04:03 -070096 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -070097#undef BLOCK
98#define BLOCK(i) \
Joe Perches687c8052008-03-23 01:04:03 -070099 LD(i, 0) \
100 LD(i + 1, 1) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700102 PF1(i + 2) \
103 LD(i + 2, 2) \
104 LD(i + 3, 3) \
105 PF0(i + 4) \
106 PF0(i + 6) \
107 XO1(i, 0) \
108 XO1(i + 1, 1) \
109 XO1(i + 2, 2) \
110 XO1(i + 3, 3) \
111 ST(i, 0) \
112 ST(i + 1, 1) \
113 ST(i + 2, 2) \
114 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
116
117 PF0(0)
118 PF0(2)
119
120 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700121 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122
123 BLOCK(0)
124 BLOCK(4)
125 BLOCK(8)
126 BLOCK(12)
127
Joe Perches687c8052008-03-23 01:04:03 -0700128 " addq %[inc], %[p1] ;\n"
129 " addq %[inc], %[p2] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700130 " decl %[cnt] ; jnz 1b"
131 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
Joe Perches687c8052008-03-23 01:04:03 -0700132 : [inc] "r" (256UL)
133 : "memory");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134
135 XMMS_RESTORE;
136}
137
138static void
139xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
140 unsigned long *p3)
141{
142 unsigned int lines = bytes >> 8;
143 xmm_store_t xmm_save[4];
144 unsigned long cr0;
145
146 XMMS_SAVE;
147
Joe Perches687c8052008-03-23 01:04:03 -0700148 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700149#undef BLOCK
150#define BLOCK(i) \
151 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700152 PF1(i + 2) \
153 LD(i, 0) \
154 LD(i + 1, 1) \
155 LD(i + 2, 2) \
156 LD(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157 PF2(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700158 PF2(i + 2) \
159 PF0(i + 4) \
160 PF0(i + 6) \
161 XO1(i, 0) \
162 XO1(i + 1, 1) \
163 XO1(i + 2, 2) \
164 XO1(i + 3, 3) \
165 XO2(i, 0) \
166 XO2(i + 1, 1) \
167 XO2(i + 2, 2) \
168 XO2(i + 3, 3) \
169 ST(i, 0) \
170 ST(i + 1, 1) \
171 ST(i + 2, 2) \
172 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700173
174
175 PF0(0)
176 PF0(2)
177
178 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700179 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700180
181 BLOCK(0)
182 BLOCK(4)
183 BLOCK(8)
184 BLOCK(12)
185
Joe Perches687c8052008-03-23 01:04:03 -0700186 " addq %[inc], %[p1] ;\n"
187 " addq %[inc], %[p2] ;\n"
188 " addq %[inc], %[p3] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700189 " decl %[cnt] ; jnz 1b"
190 : [cnt] "+r" (lines),
191 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
192 : [inc] "r" (256UL)
Joe Perches687c8052008-03-23 01:04:03 -0700193 : "memory");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700194 XMMS_RESTORE;
195}
196
197static void
198xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
199 unsigned long *p3, unsigned long *p4)
200{
201 unsigned int lines = bytes >> 8;
Joe Perches687c8052008-03-23 01:04:03 -0700202 xmm_store_t xmm_save[4];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700203 unsigned long cr0;
204
205 XMMS_SAVE;
206
Joe Perches687c8052008-03-23 01:04:03 -0700207 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700208#undef BLOCK
209#define BLOCK(i) \
210 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700211 PF1(i + 2) \
212 LD(i, 0) \
213 LD(i + 1, 1) \
214 LD(i + 2, 2) \
215 LD(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216 PF2(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700217 PF2(i + 2) \
218 XO1(i, 0) \
219 XO1(i + 1, 1) \
220 XO1(i + 2, 2) \
221 XO1(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222 PF3(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700223 PF3(i + 2) \
224 PF0(i + 4) \
225 PF0(i + 6) \
226 XO2(i, 0) \
227 XO2(i + 1, 1) \
228 XO2(i + 2, 2) \
229 XO2(i + 3, 3) \
230 XO3(i, 0) \
231 XO3(i + 1, 1) \
232 XO3(i + 2, 2) \
233 XO3(i + 3, 3) \
234 ST(i, 0) \
235 ST(i + 1, 1) \
236 ST(i + 2, 2) \
237 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700238
239
240 PF0(0)
241 PF0(2)
242
243 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700244 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700245
246 BLOCK(0)
247 BLOCK(4)
248 BLOCK(8)
249 BLOCK(12)
250
Joe Perches687c8052008-03-23 01:04:03 -0700251 " addq %[inc], %[p1] ;\n"
252 " addq %[inc], %[p2] ;\n"
253 " addq %[inc], %[p3] ;\n"
254 " addq %[inc], %[p4] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700255 " decl %[cnt] ; jnz 1b"
256 : [cnt] "+c" (lines),
257 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
258 : [inc] "r" (256UL)
Joe Perches687c8052008-03-23 01:04:03 -0700259 : "memory" );
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
261 XMMS_RESTORE;
262}
263
264static void
265xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
266 unsigned long *p3, unsigned long *p4, unsigned long *p5)
267{
Joe Perches687c8052008-03-23 01:04:03 -0700268 unsigned int lines = bytes >> 8;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269 xmm_store_t xmm_save[4];
270 unsigned long cr0;
271
272 XMMS_SAVE;
273
Joe Perches687c8052008-03-23 01:04:03 -0700274 asm volatile(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275#undef BLOCK
276#define BLOCK(i) \
277 PF1(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700278 PF1(i + 2) \
279 LD(i, 0) \
280 LD(i + 1, 1) \
281 LD(i + 2, 2) \
282 LD(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283 PF2(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700284 PF2(i + 2) \
285 XO1(i, 0) \
286 XO1(i + 1, 1) \
287 XO1(i + 2, 2) \
288 XO1(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 PF3(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700290 PF3(i + 2) \
291 XO2(i, 0) \
292 XO2(i + 1, 1) \
293 XO2(i + 2, 2) \
294 XO2(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700295 PF4(i) \
Joe Perches687c8052008-03-23 01:04:03 -0700296 PF4(i + 2) \
297 PF0(i + 4) \
298 PF0(i + 6) \
299 XO3(i, 0) \
300 XO3(i + 1, 1) \
301 XO3(i + 2, 2) \
302 XO3(i + 3, 3) \
303 XO4(i, 0) \
304 XO4(i + 1, 1) \
305 XO4(i + 2, 2) \
306 XO4(i + 3, 3) \
307 ST(i, 0) \
308 ST(i + 1, 1) \
309 ST(i + 2, 2) \
310 ST(i + 3, 3) \
Linus Torvalds1da177e2005-04-16 15:20:36 -0700311
312
313 PF0(0)
314 PF0(2)
315
316 " .align 32 ;\n"
Joe Perches687c8052008-03-23 01:04:03 -0700317 " 1: ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318
319 BLOCK(0)
320 BLOCK(4)
321 BLOCK(8)
322 BLOCK(12)
323
Joe Perches687c8052008-03-23 01:04:03 -0700324 " addq %[inc], %[p1] ;\n"
325 " addq %[inc], %[p2] ;\n"
326 " addq %[inc], %[p3] ;\n"
327 " addq %[inc], %[p4] ;\n"
328 " addq %[inc], %[p5] ;\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 " decl %[cnt] ; jnz 1b"
330 : [cnt] "+c" (lines),
Joe Perches687c8052008-03-23 01:04:03 -0700331 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332 [p5] "+r" (p5)
333 : [inc] "r" (256UL)
334 : "memory");
335
336 XMMS_RESTORE;
337}
338
339static struct xor_block_template xor_block_sse = {
Joe Perches687c8052008-03-23 01:04:03 -0700340 .name = "generic_sse",
341 .do_2 = xor_sse_2,
342 .do_3 = xor_sse_3,
343 .do_4 = xor_sse_4,
344 .do_5 = xor_sse_5,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700345};
346
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition: the SSE template is
 * the only one handed to xor_speed().
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
	xor_speed(&xor_block_sse); \
} while (0)

/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 *
 * The FASTEST argument is deliberately ignored.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)