/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
13
14
/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */
19
/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
25
/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */
33
/*
 * One 16-byte-aligned save slot.  XMMS_SAVE / XMMS_RESTORE spill and reload
 * one XMM register per element of a four-element array of these.
 */
typedef struct {
	unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;
35
/*
 * Doesn't use gcc to save the XMM registers, because there is no easy way to
 * tell it to do a clts before the register saving.
 *
 * XMMS_SAVE relies on two locals being declared in the enclosing scope:
 *   cr0      - unsigned long that receives the current value of %cr0
 *   xmm_save - xmm_store_t[4] buffer that receives %xmm0-%xmm3
 *
 * It disables preemption, reads %cr0 into cr0, executes clts and then stores
 * %xmm0-%xmm3 at offsets 0x00/0x10/0x20/0x30 of xmm_save.  Each use must be
 * paired with XMMS_RESTORE, which re-enables preemption.
 */
#define XMMS_SAVE do {					\
	preempt_disable();				\
	asm volatile (					\
		"movq %%cr0,%0 ;\n\t"			\
		"clts ;\n\t"				\
		"movups %%xmm0,(%1) ;\n\t"		\
		"movups %%xmm1,0x10(%1) ;\n\t"		\
		"movups %%xmm2,0x20(%1) ;\n\t"		\
		"movups %%xmm3,0x30(%1) ;\n\t"		\
		: "=&r" (cr0)				\
		: "r" (xmm_save)			\
		: "memory");				\
} while(0)
51
/*
 * Counterpart of XMMS_SAVE.  Uses the same enclosing-scope locals (cr0 and
 * xmm_save): issues an sfence, reloads %xmm0-%xmm3 from xmm_save, writes the
 * saved value back to %cr0 and finally re-enables preemption.
 */
#define XMMS_RESTORE do {				\
	asm volatile (					\
		"sfence ;\n\t"				\
		"movups (%1),%%xmm0 ;\n\t"		\
		"movups 0x10(%1),%%xmm1 ;\n\t"		\
		"movups 0x20(%1),%%xmm2 ;\n\t"		\
		"movups 0x30(%1),%%xmm3 ;\n\t"		\
		"movq %0,%%cr0 ;\n\t"			\
		:					\
		: "r" (cr0), "r" (xmm_save)		\
		: "memory");				\
	preempt_enable();				\
} while(0)
65
/*
 * String fragments from which the inline-asm bodies of xor_sse_N() are built.
 * They refer to the named asm operands [p1]..[p6].
 *
 *   OFFS(x)    - byte offset of the x-th 16-byte chunk
 *   PF_OFFS(x) - the same offset plus 256 bytes (used for prefetching)
 *   LD(x,y)    - movaps: load chunk x of [p1] into %xmm<y>
 *   ST(x,y)    - movaps: store %xmm<y> to chunk x of [p1]
 *   PFn(x)     - prefetchnta at PF_OFFS(x) of [p<n+1>]
 *   XOn(x,y)   - xorps: xor chunk x of [p<n+1>] into %xmm<y>
 */
#define OFFS(x)		"16*(" #x ")"
#define PF_OFFS(x)	"256+16*(" #x ")"
#define PF0(x)		" prefetchnta " PF_OFFS(x) "(%[p1]) ;\n"
#define LD(x,y)		" movaps " OFFS(x) "(%[p1]), %%xmm" #y " ;\n"
#define ST(x,y)		" movaps %%xmm" #y ", " OFFS(x) "(%[p1]) ;\n"
#define PF1(x)		" prefetchnta " PF_OFFS(x) "(%[p2]) ;\n"
#define PF2(x)		" prefetchnta " PF_OFFS(x) "(%[p3]) ;\n"
#define PF3(x)		" prefetchnta " PF_OFFS(x) "(%[p4]) ;\n"
#define PF4(x)		" prefetchnta " PF_OFFS(x) "(%[p5]) ;\n"
#define PF5(x)		" prefetchnta " PF_OFFS(x) "(%[p6]) ;\n"
#define XO1(x,y)	" xorps " OFFS(x) "(%[p2]), %%xmm" #y " ;\n"
#define XO2(x,y)	" xorps " OFFS(x) "(%[p3]), %%xmm" #y " ;\n"
#define XO3(x,y)	" xorps " OFFS(x) "(%[p4]), %%xmm" #y " ;\n"
#define XO4(x,y)	" xorps " OFFS(x) "(%[p5]), %%xmm" #y " ;\n"
#define XO5(x,y)	" xorps " OFFS(x) "(%[p6]), %%xmm" #y " ;\n"
81
82
83static void
84xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
85{
86 unsigned int lines = bytes >> 8;
87 unsigned long cr0;
88 xmm_store_t xmm_save[4];
89
90 XMMS_SAVE;
91
92 asm volatile (
93#undef BLOCK
94#define BLOCK(i) \
95 LD(i,0) \
96 LD(i+1,1) \
97 PF1(i) \
98 PF1(i+2) \
99 LD(i+2,2) \
100 LD(i+3,3) \
101 PF0(i+4) \
102 PF0(i+6) \
103 XO1(i,0) \
104 XO1(i+1,1) \
105 XO1(i+2,2) \
106 XO1(i+3,3) \
107 ST(i,0) \
108 ST(i+1,1) \
109 ST(i+2,2) \
110 ST(i+3,3) \
111
112
113 PF0(0)
114 PF0(2)
115
116 " .align 32 ;\n"
117 " 1: ;\n"
118
119 BLOCK(0)
120 BLOCK(4)
121 BLOCK(8)
122 BLOCK(12)
123
124 " addq %[inc], %[p1] ;\n"
125 " addq %[inc], %[p2] ;\n"
126 " decl %[cnt] ; jnz 1b"
127 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
128 : [inc] "r" (256UL)
129 : "memory");
130
131 XMMS_RESTORE;
132}
133
134static void
135xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
136 unsigned long *p3)
137{
138 unsigned int lines = bytes >> 8;
139 xmm_store_t xmm_save[4];
140 unsigned long cr0;
141
142 XMMS_SAVE;
143
144 __asm__ __volatile__ (
145#undef BLOCK
146#define BLOCK(i) \
147 PF1(i) \
148 PF1(i+2) \
149 LD(i,0) \
150 LD(i+1,1) \
151 LD(i+2,2) \
152 LD(i+3,3) \
153 PF2(i) \
154 PF2(i+2) \
155 PF0(i+4) \
156 PF0(i+6) \
157 XO1(i,0) \
158 XO1(i+1,1) \
159 XO1(i+2,2) \
160 XO1(i+3,3) \
161 XO2(i,0) \
162 XO2(i+1,1) \
163 XO2(i+2,2) \
164 XO2(i+3,3) \
165 ST(i,0) \
166 ST(i+1,1) \
167 ST(i+2,2) \
168 ST(i+3,3) \
169
170
171 PF0(0)
172 PF0(2)
173
174 " .align 32 ;\n"
175 " 1: ;\n"
176
177 BLOCK(0)
178 BLOCK(4)
179 BLOCK(8)
180 BLOCK(12)
181
182 " addq %[inc], %[p1] ;\n"
183 " addq %[inc], %[p2] ;\n"
184 " addq %[inc], %[p3] ;\n"
185 " decl %[cnt] ; jnz 1b"
186 : [cnt] "+r" (lines),
187 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
188 : [inc] "r" (256UL)
189 : "memory");
190 XMMS_RESTORE;
191}
192
193static void
194xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
195 unsigned long *p3, unsigned long *p4)
196{
197 unsigned int lines = bytes >> 8;
198 xmm_store_t xmm_save[4];
199 unsigned long cr0;
200
201 XMMS_SAVE;
202
203 __asm__ __volatile__ (
204#undef BLOCK
205#define BLOCK(i) \
206 PF1(i) \
207 PF1(i+2) \
208 LD(i,0) \
209 LD(i+1,1) \
210 LD(i+2,2) \
211 LD(i+3,3) \
212 PF2(i) \
213 PF2(i+2) \
214 XO1(i,0) \
215 XO1(i+1,1) \
216 XO1(i+2,2) \
217 XO1(i+3,3) \
218 PF3(i) \
219 PF3(i+2) \
220 PF0(i+4) \
221 PF0(i+6) \
222 XO2(i,0) \
223 XO2(i+1,1) \
224 XO2(i+2,2) \
225 XO2(i+3,3) \
226 XO3(i,0) \
227 XO3(i+1,1) \
228 XO3(i+2,2) \
229 XO3(i+3,3) \
230 ST(i,0) \
231 ST(i+1,1) \
232 ST(i+2,2) \
233 ST(i+3,3) \
234
235
236 PF0(0)
237 PF0(2)
238
239 " .align 32 ;\n"
240 " 1: ;\n"
241
242 BLOCK(0)
243 BLOCK(4)
244 BLOCK(8)
245 BLOCK(12)
246
247 " addq %[inc], %[p1] ;\n"
248 " addq %[inc], %[p2] ;\n"
249 " addq %[inc], %[p3] ;\n"
250 " addq %[inc], %[p4] ;\n"
251 " decl %[cnt] ; jnz 1b"
252 : [cnt] "+c" (lines),
253 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
254 : [inc] "r" (256UL)
255 : "memory" );
256
257 XMMS_RESTORE;
258}
259
260static void
261xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
262 unsigned long *p3, unsigned long *p4, unsigned long *p5)
263{
264 unsigned int lines = bytes >> 8;
265 xmm_store_t xmm_save[4];
266 unsigned long cr0;
267
268 XMMS_SAVE;
269
270 __asm__ __volatile__ (
271#undef BLOCK
272#define BLOCK(i) \
273 PF1(i) \
274 PF1(i+2) \
275 LD(i,0) \
276 LD(i+1,1) \
277 LD(i+2,2) \
278 LD(i+3,3) \
279 PF2(i) \
280 PF2(i+2) \
281 XO1(i,0) \
282 XO1(i+1,1) \
283 XO1(i+2,2) \
284 XO1(i+3,3) \
285 PF3(i) \
286 PF3(i+2) \
287 XO2(i,0) \
288 XO2(i+1,1) \
289 XO2(i+2,2) \
290 XO2(i+3,3) \
291 PF4(i) \
292 PF4(i+2) \
293 PF0(i+4) \
294 PF0(i+6) \
295 XO3(i,0) \
296 XO3(i+1,1) \
297 XO3(i+2,2) \
298 XO3(i+3,3) \
299 XO4(i,0) \
300 XO4(i+1,1) \
301 XO4(i+2,2) \
302 XO4(i+3,3) \
303 ST(i,0) \
304 ST(i+1,1) \
305 ST(i+2,2) \
306 ST(i+3,3) \
307
308
309 PF0(0)
310 PF0(2)
311
312 " .align 32 ;\n"
313 " 1: ;\n"
314
315 BLOCK(0)
316 BLOCK(4)
317 BLOCK(8)
318 BLOCK(12)
319
320 " addq %[inc], %[p1] ;\n"
321 " addq %[inc], %[p2] ;\n"
322 " addq %[inc], %[p3] ;\n"
323 " addq %[inc], %[p4] ;\n"
324 " addq %[inc], %[p5] ;\n"
325 " decl %[cnt] ; jnz 1b"
326 : [cnt] "+c" (lines),
327 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
328 [p5] "+r" (p5)
329 : [inc] "r" (256UL)
330 : "memory");
331
332 XMMS_RESTORE;
333}
334
335static struct xor_block_template xor_block_sse = {
336 .name = "generic_sse",
337 .do_2 = xor_sse_2,
338 .do_3 = xor_sse_3,
339 .do_4 = xor_sse_4,
340 .do_5 = xor_sse_5,
341};
342
/*
 * Replace any earlier definition of XOR_TRY_TEMPLATES: the only template
 * handed to xor_speed() is the SSE one.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		xor_speed(&xor_block_sse);		\
	} while (0)
348
/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 *
 * The FASTEST argument is ignored; the result is always &xor_block_sse.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)