blob: dc640b212299cc92920a183dd381d7706bd9c674 [file] [log] [blame]
Anton Blanchard6dd7a822016-07-01 08:19:45 +10001/*
2 * Calculate the checksum of data that is 16 byte aligned and a multiple of
3 * 16 bytes.
4 *
5 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
6 * chunks in order to mask the latency of the vpmsum instructions. If we
7 * have more than 32 kB of data to checksum we repeat this step multiple
8 * times, passing in the previous 1024 bits.
9 *
10 * The next step is to reduce the 1024 bits to 64 bits. This step adds
11 * 32 bits of 0s to the end - this matches what a CRC does. We just
12 * calculate constants that land the data in this 32 bits.
13 *
14 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
15 * for n = CRC using POWER8 instructions. We use x = 32.
16 *
17 * http://en.wikipedia.org/wiki/Barrett_reduction
18 *
19 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
20 *
21 * This program is free software; you can redistribute it and/or
22 * modify it under the terms of the GNU General Public License
23 * as published by the Free Software Foundation; either version
24 * 2 of the License, or (at your option) any later version.
25 */
26#include <asm/ppc_asm.h>
27#include <asm/ppc-opcode.h>
28
/*
 * Read-only constant data. Everything in this section is 16-byte aligned
 * because the code fetches these values with lvx (16-byte vector loads).
 */
29 .section .rodata
30.balign 16
31
/*
 * Permute control vector. It is loaded into the `byteswap` vector register
 * only when BYTESWAP_DATA is defined, which this file does for big-endian
 * builds.
 */
32.byteswap_constant:
33 /* byte reverse permute constant */
34 .octa 0x0F0E0D0C0B0A09080706050403020100
35
/*
 * Largest number of bytes checksummed in one pass of the main loop (32 kB).
 * Longer buffers are processed in several passes of at most MAX_SIZE bytes.
 */
36#define MAX_SIZE 32768
37.constants:
38
39 /* Reduce 262144 bits (MAX_SIZE = 32768 bytes) to 1024 bits */
40 /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
41 .octa 0x00000000b6ca9e20000000009c37c408
42
43 /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
44 .octa 0x00000000350249a800000001b51df26c
45
46 /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
47 .octa 0x00000001862dac54000000000724b9d0
48
49 /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
50 .octa 0x00000001d87fb48c00000001c00532fe
51
52 /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
53 .octa 0x00000001f39b699e00000000f05a9362
54
55 /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
56 .octa 0x0000000101da11b400000001e1007970
57
58 /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
59 .octa 0x00000001cab571e000000000a57366ee
60
61 /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
62 .octa 0x00000000c7020cfe0000000192011284
63
64 /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
65 .octa 0x00000000cdaed1ae0000000162716d9a
66
67 /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
68 .octa 0x00000001e804effc00000000cd97ecde
69
70 /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
71 .octa 0x0000000077c3ea3a0000000058812bc0
72
73 /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
74 .octa 0x0000000068df31b40000000088b8c12e
75
76 /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
77 .octa 0x00000000b059b6c200000001230b234c
78
79 /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
80 .octa 0x0000000145fb8ed800000001120b416e
81
82 /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
83 .octa 0x00000000cbc0916800000001974aecb0
84
85 /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
86 .octa 0x000000005ceeedc2000000008ee3f226
87
88 /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
89 .octa 0x0000000047d74e8600000001089aba9a
90
91 /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
92 .octa 0x00000001407e9e220000000065113872
93
94 /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
95 .octa 0x00000001da967bda000000005c07ec10
96
97 /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
98 .octa 0x000000006c8983680000000187590924
99
100 /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
101 .octa 0x00000000f2d14c9800000000e35da7c6
102
103 /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
104 .octa 0x00000001993c6ad4000000000415855a
105
106 /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
107 .octa 0x000000014683d1ac0000000073617758
108
109 /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
110 .octa 0x00000001a7c93e6c0000000176021d28
111
112 /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
113 .octa 0x000000010211e90a00000001c358fd0a
114
115 /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
116 .octa 0x000000001119403e00000001ff7a2c18
117
118 /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
119 .octa 0x000000001c3261aa00000000f2d9f7e4
120
121 /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
122 .octa 0x000000014e37a634000000016cf1f9c8
123
124 /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
125 .octa 0x0000000073786c0c000000010af9279a
126
127 /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
128 .octa 0x000000011dc037f80000000004f101e8
129
130 /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
131 .octa 0x0000000031433dfc0000000070bcf184
132
133 /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
134 .octa 0x000000009cde8348000000000a8de642
135
136 /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
137 .octa 0x0000000038d3c2a60000000062ea130c
138
139 /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
140 .octa 0x000000011b25f26000000001eb31cbb2
141
142 /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
143 .octa 0x000000001629e6f00000000170783448
144
145 /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
146 .octa 0x0000000160838b4c00000001a684b4c6
147
148 /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
149 .octa 0x000000007a44011c00000000253ca5b4
150
151 /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
152 .octa 0x00000000226f417a0000000057b4b1e2
153
154 /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
155 .octa 0x0000000045eb2eb400000000b6bd084c
156
157 /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
158 .octa 0x000000014459d70c0000000123c2d592
159
160 /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
161 .octa 0x00000001d406ed8200000000159dafce
162
163 /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
164 .octa 0x0000000160c8e1a80000000127e1a64e
165
166 /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
167 .octa 0x0000000027ba80980000000056860754
168
169 /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
170 .octa 0x000000006d92d01800000001e661aae8
171
172 /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
173 .octa 0x000000012ed7e3f200000000f82c6166
174
175 /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
176 .octa 0x000000002dc8778800000000c4f9c7ae
177
178 /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
179 .octa 0x0000000018240bb80000000074203d20
180
181 /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
182 .octa 0x000000001ad381580000000198173052
183
184 /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
185 .octa 0x00000001396b78f200000001ce8aba54
186
187 /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
188 .octa 0x000000011a68133400000001850d5d94
189
190 /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
191 .octa 0x000000012104732e00000001d609239c
192
193 /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
194 .octa 0x00000000a140d90c000000001595f048
195
196 /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
197 .octa 0x00000001b7215eda0000000042ccee08
198
199 /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
200 .octa 0x00000001aaf1df3c000000010a389d74
201
202 /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
203 .octa 0x0000000029d15b8a000000012a840da6
204
205 /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
206 .octa 0x00000000f1a96922000000001d181c0c
207
208 /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
209 .octa 0x00000001ac80d03c0000000068b7d1f6
210
211 /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
212 .octa 0x000000000f11d56a000000005b0f14fc
213
214 /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
215 .octa 0x00000001f1c022a20000000179e9e730
216
217 /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
218 .octa 0x0000000173d00ae200000001ce1368d6
219
220 /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
221 .octa 0x00000001d4ffe4ac0000000112c3a84c
222
223 /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
224 .octa 0x000000016edc5ae400000000de940fee
225
226 /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
227 .octa 0x00000001f1a0214000000000fe896b7e
228
229 /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
230 .octa 0x00000000ca0b28a000000001f797431c
231
232 /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
233 .octa 0x00000001928e30a20000000053e989ba
234
235 /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
236 .octa 0x0000000097b1b002000000003920cd16
237
238 /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
239 .octa 0x00000000b15bf90600000001e6f579b8
240
241 /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
242 .octa 0x00000000411c5d52000000007493cb0a
243
244 /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
245 .octa 0x00000001c36f330000000001bdd376d8
246
247 /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
248 .octa 0x00000001119227e0000000016badfee6
249
250 /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
251 .octa 0x00000000114d47020000000071de5c58
252
253 /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
254 .octa 0x00000000458b5b9800000000453f317c
255
256 /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
257 .octa 0x000000012e31fb8e0000000121675cce
258
259 /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
260 .octa 0x000000005cf619d800000001f409ee92
261
262 /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
263 .octa 0x0000000063f4d8b200000000f36b9c88
264
265 /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
266 .octa 0x000000004138dc8a0000000036b398f4
267
268 /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
269 .octa 0x00000001d29ee8e000000001748f9adc
270
271 /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
272 .octa 0x000000006a08ace800000001be94ec00
273
274 /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
275 .octa 0x0000000127d4201000000000b74370d6
276
277 /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
278 .octa 0x0000000019d76b6200000001174d0b98
279
280 /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
281 .octa 0x00000001b1471f6e00000000befc06a4
282
283 /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
284 .octa 0x00000001f64c19cc00000001ae125288
285
286 /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
287 .octa 0x00000000003c0ea00000000095c19b34
288
289 /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
290 .octa 0x000000014d73abf600000001a78496f2
291
292 /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
293 .octa 0x00000001620eb84400000001ac5390a0
294
295 /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
296 .octa 0x0000000147655048000000002a80ed6e
297
298 /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
299 .octa 0x0000000067b5077e00000001fa9b0128
300
301 /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
302 .octa 0x0000000010ffe20600000001ea94929e
303
304 /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
305 .octa 0x000000000fee8f1e0000000125f4305c
306
307 /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
308 .octa 0x00000001da26fbae00000001471e2002
309
310 /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
311 .octa 0x00000001b3a8bd880000000132d2253a
312
313 /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
314 .octa 0x00000000e8f3898e00000000f26b3592
315
316 /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
317 .octa 0x00000000b0d0d28c00000000bc8b67b0
318
319 /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
320 .octa 0x0000000030f2a798000000013a826ef2
321
322 /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
323 .octa 0x000000000fba10020000000081482c84
324
325 /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
326 .octa 0x00000000bdb9bd7200000000e77307c2
327
328 /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
329 .octa 0x0000000075d3bf5a00000000d4a07ec8
330
331 /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
332 .octa 0x00000000ef1f98a00000000017102100
333
334 /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
335 .octa 0x00000000689c760200000000db406486
336
337 /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
338 .octa 0x000000016d5fa5fe0000000192db7f88
339
340 /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
341 .octa 0x00000001d0d2b9ca000000018bf67b1e
342
343 /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
344 .octa 0x0000000041e7b470000000007c09163e
345
346 /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
347 .octa 0x00000001cbb6495e000000000adac060
348
349 /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
350 .octa 0x000000010052a0b000000000bd8316ae
351
352 /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
353 .octa 0x00000001d8effb5c000000019f09ab54
354
355 /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
356 .octa 0x00000001d969853c0000000125155542
357
358 /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
359 .octa 0x00000000523ccce2000000018fdb5882
360
361 /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
362 .octa 0x000000001e2436bc00000000e794b3f4
363
364 /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
365 .octa 0x00000000ddd1c3a2000000016f9bb022
366
367 /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
368 .octa 0x0000000019fcfe3800000000290c9978
369
370 /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
371 .octa 0x00000001ce95db640000000083c0f350
372
373 /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
374 .octa 0x00000000af5828060000000173ea6628
375
376 /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
377 .octa 0x00000001006388f600000001c8b4e00a
378
379 /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
380 .octa 0x0000000179eca00a00000000de95d6aa
381
382 /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
383 .octa 0x0000000122410a6a000000010b7f7248
384
385 /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
386 .octa 0x000000004288e87c00000001326e3a06
387
388 /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
389 .octa 0x000000016c5490da00000000bb62c2e6
390
391 /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
392 .octa 0x00000000d1c71f6e0000000156a4b2c2
393
394 /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
395 .octa 0x00000001b4ce08a6000000011dfe763a
396
397 /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
398 .octa 0x00000001466ba60c000000007bcca8e2
399
400 /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
401 .octa 0x00000001f6c488a40000000186118faa
402
403 /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
404 .octa 0x000000013bfb06820000000111a65a88
405
406 /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
407 .octa 0x00000000690e9e54000000003565e1c4
408
409 /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
410 .octa 0x00000000281346b6000000012ed02a82
411
412 /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
413 .octa 0x000000015646402400000000c486ecfc
414
415 /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
416 .octa 0x000000016063a8dc0000000001b951b2
417
418 /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
419 .octa 0x0000000116a663620000000048143916
420
421 /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
422 .octa 0x000000017e8aa4d200000001dc2ae124
423
424 /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
425 .octa 0x00000001728eb10c00000001416c58d6
426
427 /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
428 .octa 0x00000001b08fd7fa00000000a479744a
429
430 /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
431 .octa 0x00000001092a16e80000000096ca3a26
432
433 /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
434 .octa 0x00000000a505637c00000000ff223d4e
435
436 /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
437 .octa 0x00000000d94869b2000000010e84da42
438
439 /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
440 .octa 0x00000001c8b203ae00000001b61ba3d0
441
442 /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
443 .octa 0x000000005704aea000000000680f2de8
444
445 /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
446 .octa 0x000000012e295fa2000000008772a9a8
447
448 /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
449 .octa 0x000000011d0908bc0000000155f295bc
450
451 /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
452 .octa 0x0000000193ed97ea00000000595f9282
453
454 /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
455 .octa 0x000000013a0f1c520000000164b1c25a
456
457 /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
458 .octa 0x000000010c2c40c000000000fbd67c50
459
460 /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
461 .octa 0x00000000ff6fac3e0000000096076268
462
463 /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
464 .octa 0x000000017b3609c000000001d288e4cc
465
466 /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
467 .octa 0x0000000088c8c92200000001eaac1bdc
468
469 /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
470 .octa 0x00000001751baae600000001f1ea39e2
471
472 /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
473 .octa 0x000000010795297200000001eb6506fc
474
475 /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
476 .octa 0x0000000162b00abe000000010f806ffe
477
478 /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
479 .octa 0x000000000d7b404c000000010408481e
480
481 /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
482 .octa 0x00000000763b13d40000000188260534
483
484 /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
485 .octa 0x00000000f6dc22d80000000058fc73e0
486
487 /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
488 .octa 0x000000007daae06000000000391c59b8
489
490 /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
491 .octa 0x000000013359ab7c000000018b638400
492
493 /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
494 .octa 0x000000008add438a000000011738f5c4
495
496 /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
497 .octa 0x00000001edbefdea000000008cf7c6da
498
499 /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
500 .octa 0x000000004104e0f800000001ef97fb16
501
502 /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
503 .octa 0x00000000b48a82220000000102130e20
504
505 /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
506 .octa 0x00000001bcb4684400000000db968898
507
508 /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
509 .octa 0x000000013293ce0a00000000b5047b5e
510
511 /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
512 .octa 0x00000001710d0844000000010b90fdb2
513
514 /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
515 .octa 0x0000000117907f6e000000004834a32e
516
517 /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
518 .octa 0x0000000087ddf93e0000000059c8f2b0
519
520 /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
521 .octa 0x000000005970e9b00000000122cec508
522
523 /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
524 .octa 0x0000000185b2b7d0000000000a330cda
525
526 /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
527 .octa 0x00000001dcee0efc000000014a47148c
528
529 /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
530 .octa 0x0000000030da27220000000042c61cb8
531
532 /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
533 .octa 0x000000012f925a180000000012fe6960
534
535 /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
536 .octa 0x00000000dd2e357c00000000dbda2c20
537
538 /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
539 .octa 0x00000000071c80de000000011122410c
540
541 /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
542 .octa 0x000000011513140a00000000977b2070
543
544 /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
545 .octa 0x00000001df876e8e000000014050438e
546
547 /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
548 .octa 0x000000015f81d6ce0000000147c840e8
549
550 /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
551 .octa 0x000000019dd94dbe00000001cc7c88ce
552
553 /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
554 .octa 0x00000001373d206e00000001476b35a4
555
556 /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
557 .octa 0x00000000668ccade000000013d52d508
558
559 /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
560 .octa 0x00000001b192d268000000008e4be32e
561
562 /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
563 .octa 0x00000000e30f3a7800000000024120fe
564
565 /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
566 .octa 0x000000010ef1f7bc00000000ddecddb4
567
568 /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
569 .octa 0x00000001f5ac738000000000d4d403bc
570
571 /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
572 .octa 0x000000011822ea7000000001734b89aa
573
574 /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
575 .octa 0x00000000c3a33848000000010e7a58d6
576
577 /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
578 .octa 0x00000001bd151c2400000001f9f04e9c
579
580 /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
581 .octa 0x0000000056002d7600000000b692225e
582
583 /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
584 .octa 0x000000014657c4f4000000019b8d3f3e
585
586 /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
587 .octa 0x0000000113742d7c00000001a874f11e
588
589 /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
590 .octa 0x000000019c5920ba000000010d5a4254
591
592 /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
593 .octa 0x000000005216d2d600000000bbb2f5d6
594
595 /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
596 .octa 0x0000000136f5ad8a0000000179cc0e36
597
598 /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
599 .octa 0x000000018b07beb600000001dca1da4a
600
601 /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
602 .octa 0x00000000db1e93b000000000feb1a192
603
604 /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
605 .octa 0x000000000b96fa3a00000000d1eeedd6
606
607 /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
608 .octa 0x00000001d9968af0000000008fad9bb4
609
610 /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
611 .octa 0x000000000e4a77a200000001884938e4
612
613 /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
614 .octa 0x00000000508c2ac800000001bc2e9bc0
615
616 /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
617 .octa 0x0000000021572a8000000001f9658a68
618
619 /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
620 .octa 0x00000001b859daf2000000001b9224fc
621
622 /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
623 .octa 0x000000016f7884740000000055b2fb84
624
625 /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
626 .octa 0x00000001b438810e000000018b090348
627
628 /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
629 .octa 0x0000000095ddc6f2000000011ccbd5ea
630
631 /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
632 .octa 0x00000001d977c20c0000000007ae47f8
633
634 /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
635 .octa 0x00000000ebedb99a0000000172acbec0
636
637 /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
638 .octa 0x00000001df9e9e9200000001c6e3ff20
639
640 /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
641 .octa 0x00000001a4a3f95200000000e1b38744
642
643 /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
644 .octa 0x00000000e2f5122000000000791585b2
645
646 /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
647 .octa 0x000000004aa01f3e00000000ac53b894
648
649 /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
650 .octa 0x00000000b3e90a5800000001ed5f2cf4
651
652 /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
653 .octa 0x000000000c9ca2aa00000001df48b2e0
654
655 /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
656 .octa 0x000000015168231600000000049c1c62
657
658 /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
659 .octa 0x0000000036fce78c000000017c460c12
660
661 /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
662 .octa 0x000000009037dc10000000015be4da7e
663
664 /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
665 .octa 0x00000000d3298582000000010f38f668
666
667 /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
668 .octa 0x00000001b42e8ad60000000039f40a00
669
670 /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
671 .octa 0x00000000142a983800000000bd4c10c4
672
673 /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
674 .octa 0x0000000109c7f1900000000042db1d98
675
676 /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
677 .octa 0x0000000056ff931000000001c905bae6
678
679 /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
680 .octa 0x00000001594513aa00000000069d40ea
681
682 /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
683 .octa 0x00000001e3b5b1e8000000008e4fbad0
684
685 /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
686 .octa 0x000000011dd5fc080000000047bedd46
687
688 /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
689 .octa 0x00000001675f0cc20000000026396bf8
690
691 /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
692 .octa 0x00000000d1c8dd4400000000379beb92
693
694 /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
695 .octa 0x0000000115ebd3d8000000000abae54a
696
697 /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
698 .octa 0x00000001ecbd0dac0000000007e6a128
699
700 /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
701 .octa 0x00000000cdf67af2000000000ade29d2
702
703 /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
704 .octa 0x000000004c01ff4c00000000f974c45c
705
706 /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
707 .octa 0x00000000f2d8657e00000000e77ac60a
708
709 /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
710 .octa 0x000000006bae74c40000000145895816
711
712 /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
713 .octa 0x0000000152af8aa00000000038e362be
714
715 /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
716 .octa 0x0000000004663802000000007f991a64
717
718 /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
719 .octa 0x00000001ab2f5afc00000000fa366d3a
720
721 /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
722 .octa 0x0000000074a4ebd400000001a2bb34f0
723
724 /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
725 .octa 0x00000001d7ab3a4c0000000028a9981e
726
727 /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
728 .octa 0x00000001a8da60c600000001dbc672be
729
730 /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
731 .octa 0x000000013cf6382000000000b04d77f6
732
733 /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
734 .octa 0x00000000bec12e1e0000000124400d96
735
736 /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
737 .octa 0x00000001c6368010000000014ca4b414
738
739 /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
740 .octa 0x00000001e6e78758000000012fe2c938
741
742 /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
743 .octa 0x000000008d7f2b3c00000001faed01e6
744
745 /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
746 .octa 0x000000016b4a156e000000007e80ecfe
747
748 /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
749 .octa 0x00000001c63cfeb60000000098daee94
750
751 /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
752 .octa 0x000000015f902670000000010a04edea
753
754 /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
755 .octa 0x00000001cd5de11e00000001c00b4524
756
757 /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
758 .octa 0x000000001acaec540000000170296550
759
760 /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
761 .octa 0x000000002bd0ca780000000181afaa48
762
763 /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
764 .octa 0x0000000032d63d5c0000000185a31ffa
765
766 /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
767 .octa 0x000000001c6d4e4c000000002469f608
768
769 /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
770 .octa 0x0000000106a60b92000000006980102a
771
772 /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
773 .octa 0x00000000d3855e120000000111ea9ca8
774
775 /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
776 .octa 0x00000000e312563600000001bd1d29ce
777
778 /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
779 .octa 0x000000009e8f7ea400000001b34b9580
780
781 /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
782 .octa 0x00000001c82e562c000000003076054e
783
784 /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
785 .octa 0x00000000ca9f09ce000000012a608ea4
786
787 /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
788 .octa 0x00000000c63764e600000000784d05fe
789
790 /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
791 .octa 0x0000000168d2e49e000000016ef0d82a
792
793 /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
794 .octa 0x00000000e986c1480000000075bda454
795
796 /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
797 .octa 0x00000000cfb65894000000003dc0a1c4
798
799 /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
800 .octa 0x0000000111cadee400000000e9a5d8be
801
802 /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
803 .octa 0x0000000171fb63ce00000001609bc4b4
804
805.short_constants:
806
807 /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
808 /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
809 .octa 0x7fec2963e5bf80485cf015c388e56f72
810
811 /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
812 .octa 0x38e888d4844752a9963a18920246e2e6
813
814 /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
815 .octa 0x42316c00730206ad419a441956993a31
816
817 /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
818 .octa 0x543d5c543e65ddf9924752ba2b830011
819
820 /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
821 .octa 0x78e87aaf56767c9255bd7f9518e4a304
822
823 /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
824 .octa 0x8f68fcec1903da7f6d76739fe0553f1e
825
826 /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
827 .octa 0x3f4840246791d588c133722b1fe0b5c3
828
829 /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
830 .octa 0x34c96751b04de25a64b67ee0e55ef1f3
831
832 /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
833 .octa 0x156c8e180b4a395b069db049b8fdb1e7
834
835 /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
836 .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
837
838 /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
839 .octa 0x041d37768cd75659817cdc5119b29a35
840
841 /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
842 .octa 0x3a0777818cfaa9651ce9d94b36c41f1c
843
844 /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
845 .octa 0x0e148e8252377a554f256efcb82be955
846
847 /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
848 .octa 0x9c25531d19e65ddeec1631edb2dea967
849
850 /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
851 .octa 0x790606ff9957c0a65d27e147510ac59a
852
853 /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
854 .octa 0x82f63b786ea2d55ca66805eb18b8ea18
855
856
/*
 * Constants for the Barrett reduction described in the header comment of
 * this file. The back-tick in "p(x)`" follows the notation used by the
 * tables above, where it marks bit-reflected values.
 * NOTE(review): the code that consumes these two values lies outside the
 * portion of the file reviewed here - confirm the load order before
 * reordering them.
 */
857.barrett_constants:
858 /* 33 bit reflected Barrett constant m = (4^32)/n */
859 .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */
860 /*
 * 33 bit reflected Barrett constant n.
 * NOTE(review): 0x105ec76f1 equals (0x82f63b78 << 1) | 1, which looks like
 * the CRC32C (Castagnoli) polynomial in reflected form - confirm.
 */
861 .octa 0x00000000000000000000000105ec76f1
862
863 .text
864
/*
 * BYTESWAP_DATA selects whether the VPERM() macro below emits a real vperm
 * instruction. It is defined for big-endian builds and explicitly undefined
 * otherwise.
 */
865#if defined(__BIG_ENDIAN__)
866#define BYTESWAP_DATA
867#else
868#undef BYTESWAP_DATA
869#endif
870
/*
 * GPR aliases holding the constant byte offsets 16..112. The function
 * prologue fills them with li and uses them as the index operand of vector
 * memory accesses (e.g. the stvx register saves). r25-r31 are saved to the
 * stack on entry before being overwritten.
 */
871#define off16 r25
872#define off32 r26
873#define off48 r27
874#define off64 r28
875#define off80 r29
876#define off96 r30
877#define off112 r31
878
/*
 * Vector registers reserved for reduction constants. const1 is loaded from
 * the .constants table with lvx; const2 is a second constant register whose
 * use lies outside the portion of the file reviewed here.
 */
879#define const1 v24
880#define const2 v25
881
/*
 * Long-lived helper vectors set up once in the prologue:
 *   byteswap   - .byteswap_constant, loaded only when BYTESWAP_DATA is set
 *   mask_32bit - built by shifting 4 bytes of ones in behind zeroes
 *   mask_64bit - built by shifting 8 bytes of ones in behind zeroes
 *   zeroes     - all-zero vector (vxor of the register with itself)
 */
882#define byteswap v26
883#define mask_32bit v27
884#define mask_64bit v28
885#define zeroes v29
886
/*
 * VPERM(A, B, C, D): expands to "vperm A, B, C, D" when BYTESWAP_DATA is
 * defined, and to nothing otherwise, so call sites need no #ifdefs.
 */
887#ifdef BYTESWAP_DATA
888#define VPERM(A, B, C, D) vperm A, B, C, D
889#else
890#define VPERM(A, B, C, D)
891#endif
892
893/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
/*
 * unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len)
 *
 * In:   r3 = crc   initial checksum value
 *       r4 = p     data; per the file header it is 16 byte aligned
 *       r5 = len   byte count; per the file header a multiple of 16
 * Out:  r3 = checksum (the incoming crc is returned unchanged if len == 0)
 *
 * Flow:
 *   len == 0    -> .Lzero
 *   len <  256  -> .Lshort: each 16 byte chunk is multiplied (vpmsumw) by
 *                  its own entry of .short_constants, the products are
 *                  xored together and handed to .Lbarrett_reduction.
 *   len >= 256  -> passes of at most MAX_SIZE bytes (label 1:), each one
 *                  folding eight parallel 16 byte lanes (vpmsumd) down to
 *                  1024 bits held in v16-v23; those 1024 bits plus a tail
 *                  of up to 112 bytes are then reduced with vpmsumw and
 *                  passed to .Lbarrett_reduction.
 *
 * Register use:
 *   r0       0 on the first pass, 1 once v16-v23 carry data over from a
 *            previous pass
 *   r6       bytes still to be handed to the main loop
 *   r7       size of the current pass, later loop / chunk counts
 *   r8, r9   starting offset into .constants
 *   r10      copy of the incoming crc for the len == 0 return
 *   r25-r31  off16..off112 byte offsets (saved and restored here)
 *   v0-v7    accumulators, v8-v15 vpmsumd products, v16-v23 input data
 *   v20-v29  saved on entry, restored at .Lout
 *
 * All saves are made at negative offsets from r1 without moving r1.
 * NOTE(review): this relies on the ABI letting a leaf function use that
 * area below the stack pointer - confirm for every ABI this is built for.
 *
 * "ori r2,r2,0" leaves r2 unchanged. NOTE(review): presumably these are
 * instruction grouping/scheduling hints - confirm before removing any.
 */
FUNC_START(__crc32c_vpmsum)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0		/* no data carried over in v16-v23 yet */

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	/* Keep the incoming crc; it is the return value when len == 0 */
	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	/* Masks with the low 4 / low 8 bytes set to all ones */
	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	/* r6 = len with the low 7 bits cleared: bytes for the main loop */
	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6		/* final pass: take everything that is left */
2:	subf	r6,r7,r6	/* r6 -= bytes assigned to this pass */

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	/* const2 is reloaded here; the remaining four lanes use const1 */
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	/* The result lands in v16-v23, where the next pass expects it */
	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * Loop back while main-loop bytes remain. The compare is done on r6
	 * before 128 is added to it; addi does not alter the condition
	 * register, so bne still tests the old value.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/*
	 * Now reduce the tail (0 - 112 bytes). CTR holds the number of
	 * 16 byte chunks; every chunk's product is xored into v0.
	 */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

	/* Entered with the value to reduce in v0 (also from .Lshort) */
.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1

	vand	v0,v0,mask_64bit

	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */

	/* Get it into r3 */
	MFVRD(R3, v0)

	/* Common exit: restore everything saved on entry; r3 is the result */
.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

	/*
	 * Reached when the pass holds a single 128 byte group before the
	 * separately handled final 128 bytes: do the one vpmsumd round and
	 * join the second cool down pass.
	 */
.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

	/*
	 * len < 256. CTR counts the 16 byte chunks; chunk N is loaded into
	 * vN, multiplied by its constant, and the bdz after the last chunk
	 * jumps to .LvN, from where execution falls through the xor ladder
	 * down to .Lv0.
	 */
.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19/v20 collect the odd/even numbered chunk products */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* Second group of up to eight chunks */
	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

	/* len == 0: hand back the crc we were given */
.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(__crc32c_vpmsum)
1553FUNC_END(__crc32_vpmsum)