/* x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                         sign    verify    sign/s verify/s
 *    rsa  512 bits      0.0006s   0.0001s   1683.8  18456.2
 *    rsa 1024 bits      0.0028s   0.0002s    356.0   6407.0
 *    rsa 2048 bits      0.0172s   0.0005s     58.0   1957.8
 *    rsa 4096 bits      0.1155s   0.0018s      8.7    555.6
 *                         sign    verify    sign/s verify/s
 *    dsa  512 bits      0.0005s   0.0006s   2100.8   1768.3
 *    dsa 1024 bits      0.0014s   0.0018s    692.3    559.2
 *    dsa 2048 bits      0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                         sign    verify    sign/s verify/s
 *    rsa  512 bits      0.0004s   0.0000s   2767.1  33297.9
 *    rsa 1024 bits      0.0012s   0.0001s    867.4  14674.7
 *    rsa 2048 bits      0.0061s   0.0002s    164.0   5270.0
 *    rsa 4096 bits      0.0384s   0.0006s     26.1   1650.8
 *                         sign    verify    sign/s verify/s
 *    dsa  512 bits      0.0002s   0.0003s   4442.2   3786.3
 *    dsa 1024 bits      0.0005s   0.0007s   1835.1   1497.4
 *    dsa 2048 bits      0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */

#include <openssl/bn.h>

/* TODO(davidben): Get this file working on Windows x64. */
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)

#include "../internal.h"

#undef mul
#undef mul_add

#define asm __asm__

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero.
 */
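/* mul_add(r, a, word, carry) computes r += a * word + carry and leaves the
 * new carry word in |carry|; mul(r, a, word, carry) computes
 * r = low(a * word + carry) and leaves the high word in |carry|;
 * sqr(r0, r1, a) computes (r1:r0) = a * a. */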
#define mul_add(r, a, word, carry)                                        \
  do {                                                                    \
    register BN_ULONG high, low;                                          \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc");    \
    asm("addq %2,%0; adcq %3,%1"                                          \
        : "+r"(carry), "+d"(high)                                         \
        : "a"(low), "g"(0)                                                \
        : "cc");                                                          \
    asm("addq %2,%0; adcq %3,%1"                                          \
        : "+m"(r), "+d"(high)                                             \
        : "r"(carry), "g"(0)                                              \
        : "cc");                                                          \
    (carry) = high;                                                       \
  } while (0)

#define mul(r, a, word, carry)                                            \
  do {                                                                    \
    register BN_ULONG high, low;                                          \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc");    \
    asm("addq %2,%0; adcq %3,%1"                                          \
        : "+r"(carry), "+d"(high)                                         \
        : "a"(low), "g"(0)                                                \
        : "cc");                                                          \
    (r) = (carry);                                                        \
    (carry) = high;                                                       \
  } while (0)
#undef sqr
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");

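/* bn_mul_add_words computes rp[i] += ap[i] * w for 0 <= i < num and returns
 * the final carry word. */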
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[2], ap[2], w, c1);
    return c1;
  }

  return c1;
}

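/* bn_mul_words computes rp[i] = ap[i] * w for 0 <= i < num and returns the
 * final carry word. */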
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[2], ap[2], w, c1);
  }
  return c1;
}

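/* bn_sqr_words squares each word of |a| into a double-word result:
 * (r[2*i+1], r[2*i]) = a[i] * a[i] for 0 <= i < n. */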
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  if (n) {
    sqr(r[0], r[1], a[0]);
    if (--n == 0) {
      return;
    }
    sqr(r[2], r[3], a[1]);
    if (--n == 0) {
      return;
    }
    sqr(r[4], r[5], a[2]);
  }
}

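/* bn_add_words computes rp[i] = ap[i] + bp[i] for 0 <= i < n and returns the
 * final carry bit (0 or 1). */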
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  asm volatile (
      " subq %0,%0 \n" /* clear carry */
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " adcq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " loop 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

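/* bn_sub_words computes rp[i] = ap[i] - bp[i] for 0 <= i < n and returns the
 * final borrow bit (0 or 1). */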
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  asm volatile (
      " subq %0,%0 \n" /* clear borrow */
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " sbbq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " loop 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

/* mul_add_c(a,b,c0,c1,c2)    -- c += a*b         for the three-word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c += 2*a*b       for the three-word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c += a[i]^2      for the three-word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c += 2*a[i]*a[j] for the three-word number c=(c2,c1,c0) */

/* Keep in mind that carrying into the high part of the multiplication result
 * cannot overflow, because that high word cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2)                                       \
  do {                                                                    \
    BN_ULONG t1, t2;                                                      \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");          \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)                                       \
  do {                                                                    \
    BN_ULONG t1, t2;                                                      \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc");             \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)                                      \
  do {                                                                    \
    BN_ULONG t1, t2;                                                      \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");          \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

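/* bn_mul_comba8 computes the full 16-word product r = a * b, where |a| and |b|
 * are 8 words each, accumulating one output column at a time. */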
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

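/* bn_mul_comba4 computes the full 8-word product r = a * b, where |a| and |b|
 * are 4 words each. */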
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

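/* bn_sqr_comba8 computes the 16-word square r = a * a, where |a| is 8 words,
 * using sqr_add_c for the diagonal terms and sqr_add_c2 for the doubled
 * off-diagonal terms. */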
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

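/* bn_sqr_comba4 computes the 8-word square r = a * a, where |a| is 4 words. */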
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef sqr_add_c
#undef mul_add_c2
#undef sqr_add_c2

#endif /* !NO_ASM && X86_64 && __GNUC__ */