#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"

62static void
63xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
64{
65 unsigned long lines = bytes >> 8;
66
67 kernel_fpu_begin();
68
69 asm volatile(
70#undef BLOCK
71#define BLOCK(i) \
72 LD(i, 0) \
73 LD(i + 1, 1) \
74 PF1(i) \
75 PF1(i + 2) \
76 LD(i + 2, 2) \
77 LD(i + 3, 3) \
78 PF0(i + 4) \
79 PF0(i + 6) \
80 XO1(i, 0) \
81 XO1(i + 1, 1) \
82 XO1(i + 2, 2) \
83 XO1(i + 3, 3) \
84 ST(i, 0) \
85 ST(i + 1, 1) \
86 ST(i + 2, 2) \
87 ST(i + 3, 3) \
88
89
90 PF0(0)
91 PF0(2)
92
93 " .align 32 ;\n"
94 " 1: ;\n"
95
96 BLOCK(0)
97 BLOCK(4)
98 BLOCK(8)
99 BLOCK(12)
100
101 " add %[inc], %[p1] ;\n"
102 " add %[inc], %[p2] ;\n"
103 " dec %[cnt] ;\n"
104 " jnz 1b ;\n"
105 : [cnt] "+r" (lines),
106 [p1] "+r" (p1), [p2] "+r" (p2)
107 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
108 : "memory");
109
110 kernel_fpu_end();
111}
112
113static void
114xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
115 unsigned long *p3)
116{
117 unsigned long lines = bytes >> 8;
118
119 kernel_fpu_begin();
120
121 asm volatile(
122#undef BLOCK
123#define BLOCK(i) \
124 PF1(i) \
125 PF1(i + 2) \
126 LD(i, 0) \
127 LD(i + 1, 1) \
128 LD(i + 2, 2) \
129 LD(i + 3, 3) \
130 PF2(i) \
131 PF2(i + 2) \
132 PF0(i + 4) \
133 PF0(i + 6) \
134 XO1(i, 0) \
135 XO1(i + 1, 1) \
136 XO1(i + 2, 2) \
137 XO1(i + 3, 3) \
138 XO2(i, 0) \
139 XO2(i + 1, 1) \
140 XO2(i + 2, 2) \
141 XO2(i + 3, 3) \
142 ST(i, 0) \
143 ST(i + 1, 1) \
144 ST(i + 2, 2) \
145 ST(i + 3, 3) \
146
147
148 PF0(0)
149 PF0(2)
150
151 " .align 32 ;\n"
152 " 1: ;\n"
153
154 BLOCK(0)
155 BLOCK(4)
156 BLOCK(8)
157 BLOCK(12)
158
159 " add %[inc], %[p1] ;\n"
160 " add %[inc], %[p2] ;\n"
161 " add %[inc], %[p3] ;\n"
162 " dec %[cnt] ;\n"
163 " jnz 1b ;\n"
164 : [cnt] "+r" (lines),
165 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
166 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
167 : "memory");
168
169 kernel_fpu_end();
170}
171
172static void
173xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
174 unsigned long *p3, unsigned long *p4)
175{
176 unsigned long lines = bytes >> 8;
177
178 kernel_fpu_begin();
179
180 asm volatile(
181#undef BLOCK
182#define BLOCK(i) \
183 PF1(i) \
184 PF1(i + 2) \
185 LD(i, 0) \
186 LD(i + 1, 1) \
187 LD(i + 2, 2) \
188 LD(i + 3, 3) \
189 PF2(i) \
190 PF2(i + 2) \
191 XO1(i, 0) \
192 XO1(i + 1, 1) \
193 XO1(i + 2, 2) \
194 XO1(i + 3, 3) \
195 PF3(i) \
196 PF3(i + 2) \
197 PF0(i + 4) \
198 PF0(i + 6) \
199 XO2(i, 0) \
200 XO2(i + 1, 1) \
201 XO2(i + 2, 2) \
202 XO2(i + 3, 3) \
203 XO3(i, 0) \
204 XO3(i + 1, 1) \
205 XO3(i + 2, 2) \
206 XO3(i + 3, 3) \
207 ST(i, 0) \
208 ST(i + 1, 1) \
209 ST(i + 2, 2) \
210 ST(i + 3, 3) \
211
212
213 PF0(0)
214 PF0(2)
215
216 " .align 32 ;\n"
217 " 1: ;\n"
218
219 BLOCK(0)
220 BLOCK(4)
221 BLOCK(8)
222 BLOCK(12)
223
224 " add %[inc], %[p1] ;\n"
225 " add %[inc], %[p2] ;\n"
226 " add %[inc], %[p3] ;\n"
227 " add %[inc], %[p4] ;\n"
228 " dec %[cnt] ;\n"
229 " jnz 1b ;\n"
230 : [cnt] "+r" (lines), [p1] "+r" (p1),
231 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
232 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
233 : "memory");
234
235 kernel_fpu_end();
236}
237
238static void
239xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
240 unsigned long *p3, unsigned long *p4, unsigned long *p5)
241{
242 unsigned long lines = bytes >> 8;
243
244 kernel_fpu_begin();
245
246 asm volatile(
247#undef BLOCK
248#define BLOCK(i) \
249 PF1(i) \
250 PF1(i + 2) \
251 LD(i, 0) \
252 LD(i + 1, 1) \
253 LD(i + 2, 2) \
254 LD(i + 3, 3) \
255 PF2(i) \
256 PF2(i + 2) \
257 XO1(i, 0) \
258 XO1(i + 1, 1) \
259 XO1(i + 2, 2) \
260 XO1(i + 3, 3) \
261 PF3(i) \
262 PF3(i + 2) \
263 XO2(i, 0) \
264 XO2(i + 1, 1) \
265 XO2(i + 2, 2) \
266 XO2(i + 3, 3) \
267 PF4(i) \
268 PF4(i + 2) \
269 PF0(i + 4) \
270 PF0(i + 6) \
271 XO3(i, 0) \
272 XO3(i + 1, 1) \
273 XO3(i + 2, 2) \
274 XO3(i + 3, 3) \
275 XO4(i, 0) \
276 XO4(i + 1, 1) \
277 XO4(i + 2, 2) \
278 XO4(i + 3, 3) \
279 ST(i, 0) \
280 ST(i + 1, 1) \
281 ST(i + 2, 2) \
282 ST(i + 3, 3) \
283
284
285 PF0(0)
286 PF0(2)
287
288 " .align 32 ;\n"
289 " 1: ;\n"
290
291 BLOCK(0)
292 BLOCK(4)
293 BLOCK(8)
294 BLOCK(12)
295
296 " add %[inc], %[p1] ;\n"
297 " add %[inc], %[p2] ;\n"
298 " add %[inc], %[p3] ;\n"
299 " add %[inc], %[p4] ;\n"
300 " add %[inc], %[p5] ;\n"
301 " dec %[cnt] ;\n"
302 " jnz 1b ;\n"
303 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
304 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
305 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
306 : "memory");
307
308 kernel_fpu_end();
309}
310
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#endif /* _ASM_X86_XOR_H */