/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2 * All rights reserved.
3 *
4 * This package is an SSL implementation written
5 * by Eric Young (eay@cryptsoft.com).
6 * The implementation was written so as to conform with Netscapes SSL.
7 *
8 * This library is free for commercial and non-commercial use as long as
9 * the following conditions are aheared to. The following conditions
10 * apply to all code found in this distribution, be it the RC4, RSA,
11 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
12 * included with this distribution is covered by the same copyright terms
13 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14 *
15 * Copyright remains Eric Young's, and as such any Copyright notices in
16 * the code are not to be removed.
17 * If this package is used in a product, Eric Young should be given attribution
18 * as the author of the parts of the library used.
19 * This can be in the form of a textual message at program startup or
20 * in documentation (online or textual) provided with the package.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the copyright
26 * notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 * notice, this list of conditions and the following disclaimer in the
29 * documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 * must display the following acknowledgement:
32 * "This product includes cryptographic software written by
33 * Eric Young (eay@cryptsoft.com)"
34 * The word 'cryptographic' can be left out if the rouines from the library
35 * being used are not cryptographic related :-).
36 * 4. If you include any Windows specific code (or a derivative thereof) from
37 * the apps directory (application code) you must include an acknowledgement:
38 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39 *
40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * The licence and distribution terms for any publically available version or
53 * derivative of this code cannot be changed. i.e. this code cannot simply be
54 * copied and put under another distribution licence
55 * [including the GNU Public Licence.] */
56
57#include <openssl/bn.h>
58
59#include <assert.h>
60
61#include "internal.h"
62
63
/* Generic implementations of most operations are needed for:
 * - Configurations without inline assembly.
 * - Architectures other than x86 or x86_64.
 * - Windows x86_64; x86_64-gcc.c does not build on MSVC. */
#if defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
    (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))

/* MSVC spells alloca as _alloca and has no <alloca.h>; other platforms
 * declare alloca in <alloca.h>. bn_mul_mont below uses it for scratch. */
#if defined(OPENSSL_WINDOWS)
#define alloca _alloca
#else
#include <alloca.h>
#endif
77
#ifdef BN_LLONG
/* mul_add(r, a, w, c): r = low word of (w * a + r + c); c = high word.
 * Uses the double-width BN_ULLONG type; Lw/Hw extract low/high words. */
#define mul_add(r, a, w, c)             \
  {                                     \
    BN_ULLONG t;                        \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t);                        \
    (c) = Hw(t);                        \
  }

/* mul(r, a, w, c): r = low word of (w * a + c); c = high word. */
#define mul(r, a, w, c)           \
  {                               \
    BN_ULLONG t;                  \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t);                  \
    (c) = Hw(t);                  \
  }

/* sqr(r0, r1, a): (r1:r0) = a * a; r0 is the low word, r1 the high word. */
#define sqr(r0, r1, a)        \
  {                           \
    BN_ULLONG t;              \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t);             \
    (r1) = Hw(t);             \
  }
102
#elif defined(BN_UMULT_LOHI)
/* mul_add(r, a, w, c): r = low word of (w * a + r + c); c = high word.
 * BN_UMULT_LOHI(lo, hi, x, y) yields the double-width product x*y. */
#define mul_add(r, a, w, c)               \
  {                                       \
    BN_ULONG high, low, ret, tmp = (a);   \
    ret = (r);                            \
    BN_UMULT_LOHI(low, high, w, tmp);     \
    ret += (c);                           \
    (c) = (ret < (c)) ? 1 : 0;            \
    (c) += high;                          \
    ret += low;                           \
    (c) += (ret < low) ? 1 : 0;           \
    (r) = ret;                            \
  }

/* mul(r, a, w, c): r = low word of (w * a + c); c = high word. */
#define mul(r, a, w, c)                \
  {                                    \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta);   \
    ret = low + (c);                   \
    (c) = high;                        \
    (c) += (ret < low) ? 1 : 0;        \
    (r) = ret;                         \
  }

/* sqr(r0, r1, a): (r1:r0) = a * a. */
#define sqr(r0, r1, a)                \
  {                                   \
    BN_ULONG tmp = (a);               \
    BN_UMULT_LOHI(r0, r1, tmp, tmp);  \
  }
132
#else

/*************************************************************
 * No long long type
 */

/* Half-word helpers: BN_BITS4 is half the word width and BN_MASK2l masks
 * the low half of a word. */
#define LBITS(a) ((a) & BN_MASK2l)                /* low half of a word */
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)  /* high half of a word */
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2) /* low half -> high slot */

/* NOTE(review): LL2HBITS references BN_ULLONG inside the no-long-long
 * branch, and these three macros appear unused in this file -- confirm
 * before relying on them. */
#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
146
/* mul64(l, h, bl, bh): word-by-word multiply built from half-word products.
 * On entry l/h hold the low/high halves of multiplicand a and bl/bh the
 * halves of multiplier b; on exit (h:l) is the double-width product a*b. */
#define mul64(l, h, bl, bh)       \
  {                               \
    BN_ULONG m, m1, lt, ht;       \
                                  \
    lt = l;                       \
    ht = h;                       \
    m = (bh) * (lt);              \
    lt = (bl) * (lt);             \
    m1 = (bl) * (ht);             \
    ht = (bh) * (ht);             \
    m = (m + m1) & BN_MASK2; /* sum of the two cross products */ \
    if (m < m1)                   \
      ht += L2HBITS((BN_ULONG)1); /* cross-product sum wrapped */ \
    ht += HBITS(m);               \
    m1 = L2HBITS(m);              \
    lt = (lt + m1) & BN_MASK2;    \
    if (lt < m1)                  \
      ht++;                       \
    (l) = lt;                     \
    (h) = ht;                     \
  }
168
/* sqr64(lo, ho, in): (ho:lo) = in * in via half-word products.  The single
 * cross product m appears twice in the square, hence the extra one-bit
 * shift when folding it in. */
#define sqr64(lo, ho, in)                          \
  {                                                \
    BN_ULONG l, h, m;                              \
                                                   \
    h = (in);                                      \
    l = LBITS(h);                                  \
    h = HBITS(h);                                  \
    m = (l) * (h);                                 \
    l *= l;                                        \
    h *= h;                                        \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1);       \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1);         \
    l = (l + m) & BN_MASK2;                        \
    if (l < m)                                     \
      h++;                                         \
    (lo) = l;                                      \
    (ho) = h;                                      \
  }
187
/* mul_add(r, a, bl, bh, c): r = low word of (a*b + r + c), c = high word,
 * where bl/bh are the pre-split halves of the multiplier b. */
#define mul_add(r, a, bl, bh, c) \
  {                              \
    BN_ULONG l, h;               \
                                 \
    h = (a);                     \
    l = LBITS(h);                \
    h = HBITS(h);                \
    mul64(l, h, (bl), (bh));     \
                                 \
    /* non-multiply part */      \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = (r);                   \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = h & BN_MASK2;          \
    (r) = l;                     \
  }
208
/* mul(r, a, bl, bh, c): r = low word of (a*b + c), c = high word, where
 * bl/bh are the pre-split halves of the multiplier b. */
#define mul(r, a, bl, bh, c)    \
  {                             \
    BN_ULONG l, h;              \
                                \
    h = (a);                    \
    l = LBITS(h);               \
    h = HBITS(h);               \
    mul64(l, h, (bl), (bh));    \
                                \
    /* non-multiply part */     \
    l += (c);                   \
    if ((l & BN_MASK2) < (c))   \
      h++;                      \
    (c) = h & BN_MASK2;         \
    (r) = l & BN_MASK2;         \
  }
#endif /* !BN_LLONG */
226
227#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
228
229BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
230 BN_ULONG w) {
231 BN_ULONG c1 = 0;
232
233 assert(num >= 0);
234 if (num <= 0) {
235 return c1;
236 }
237
238 while (num & ~3) {
239 mul_add(rp[0], ap[0], w, c1);
240 mul_add(rp[1], ap[1], w, c1);
241 mul_add(rp[2], ap[2], w, c1);
242 mul_add(rp[3], ap[3], w, c1);
243 ap += 4;
244 rp += 4;
245 num -= 4;
246 }
247
248 while (num) {
249 mul_add(rp[0], ap[0], w, c1);
250 ap++;
251 rp++;
252 num--;
253 }
254
255 return c1;
256}
257
258BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
259 BN_ULONG c1 = 0;
260
261 assert(num >= 0);
262 if (num <= 0) {
263 return c1;
264 }
265
266 while (num & ~3) {
267 mul(rp[0], ap[0], w, c1);
268 mul(rp[1], ap[1], w, c1);
269 mul(rp[2], ap[2], w, c1);
270 mul(rp[3], ap[3], w, c1);
271 ap += 4;
272 rp += 4;
273 num -= 4;
274 }
275 while (num) {
276 mul(rp[0], ap[0], w, c1);
277 ap++;
278 rp++;
279 num--;
280 }
281 return c1;
282}
283
284void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
285 assert(n >= 0);
286 if (n <= 0) {
287 return;
288 }
289
290 while (n & ~3) {
291 sqr(r[0], r[1], a[0]);
292 sqr(r[2], r[3], a[1]);
293 sqr(r[4], r[5], a[2]);
294 sqr(r[6], r[7], a[3]);
295 a += 4;
296 r += 8;
297 n -= 4;
298 }
299 while (n) {
300 sqr(r[0], r[1], a[0]);
301 a++;
302 r += 2;
303 n--;
304 }
305}
306
307#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
308
309BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
310 BN_ULONG w) {
311 BN_ULONG c = 0;
312 BN_ULONG bl, bh;
313
314 assert(num >= 0);
315 if (num <= 0) {
316 return (BN_ULONG)0;
317 }
318
319 bl = LBITS(w);
320 bh = HBITS(w);
321
322 while (num & ~3) {
323 mul_add(rp[0], ap[0], bl, bh, c);
324 mul_add(rp[1], ap[1], bl, bh, c);
325 mul_add(rp[2], ap[2], bl, bh, c);
326 mul_add(rp[3], ap[3], bl, bh, c);
327 ap += 4;
328 rp += 4;
329 num -= 4;
330 }
331 while (num) {
332 mul_add(rp[0], ap[0], bl, bh, c);
333 ap++;
334 rp++;
335 num--;
336 }
337 return c;
338}
339
340BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
341 BN_ULONG carry = 0;
342 BN_ULONG bl, bh;
343
344 assert(num >= 0);
345 if (num <= 0) {
346 return (BN_ULONG)0;
347 }
348
349 bl = LBITS(w);
350 bh = HBITS(w);
351
352 while (num & ~3) {
353 mul(rp[0], ap[0], bl, bh, carry);
354 mul(rp[1], ap[1], bl, bh, carry);
355 mul(rp[2], ap[2], bl, bh, carry);
356 mul(rp[3], ap[3], bl, bh, carry);
357 ap += 4;
358 rp += 4;
359 num -= 4;
360 }
361 while (num) {
362 mul(rp[0], ap[0], bl, bh, carry);
363 ap++;
364 rp++;
365 num--;
366 }
367 return carry;
368}
369
370void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
371 assert(n >= 0);
372 if (n <= 0) {
373 return;
374 }
375
376 while (n & ~3) {
377 sqr64(r[0], r[1], a[0]);
378 sqr64(r[2], r[3], a[1]);
379 sqr64(r[4], r[5], a[2]);
380 sqr64(r[6], r[7], a[3]);
381 a += 4;
382 r += 8;
383 n -= 4;
384 }
385 while (n) {
386 sqr64(r[0], r[1], a[0]);
387 a++;
388 r += 2;
389 n--;
390 }
391}
392
393#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
394
395#if defined(BN_LLONG)
396
397BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
398 return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
399}
400
401#else
402
/* Divide h,l by d and return the result. */
/* bn_div_words computes floor((h:l) / d) without a double-width integer
 * type: the divisor is normalized so its top bit is set, then two half-word
 * quotient digits are produced by estimate-and-correct schoolbook division. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

  if (d == 0) {
    /* Division by zero: saturate to all-ones instead of trapping. */
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d; /* keep the quotient from overflowing a single word */
  }

  if (i) {
    /* Normalize: shift divisor and dividend left until d's top bit is set. */
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  dh = (d & BN_MASK2h) >> BN_BITS4; /* high half of divisor */
  dl = (d & BN_MASK2l);             /* low half of divisor */
  for (;;) {
    /* Estimate the next half-word quotient digit from the top halves. */
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;
    /* Correct the over-estimate downward. */
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    /* Subtract q*d from the running remainder (h:l). */
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d; /* digit was still one too large; borrow back */
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    /* Record the high digit and shift the next half-word into position. */
    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}
472
473#endif /* !defined(BN_LLONG) */
474
475#ifdef BN_LLONG
476BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
477 int n) {
478 BN_ULLONG ll = 0;
479
480 assert(n >= 0);
481 if (n <= 0) {
482 return (BN_ULONG)0;
483 }
484
485 while (n & ~3) {
486 ll += (BN_ULLONG)a[0] + b[0];
487 r[0] = (BN_ULONG)ll & BN_MASK2;
488 ll >>= BN_BITS2;
489 ll += (BN_ULLONG)a[1] + b[1];
490 r[1] = (BN_ULONG)ll & BN_MASK2;
491 ll >>= BN_BITS2;
492 ll += (BN_ULLONG)a[2] + b[2];
493 r[2] = (BN_ULONG)ll & BN_MASK2;
494 ll >>= BN_BITS2;
495 ll += (BN_ULLONG)a[3] + b[3];
496 r[3] = (BN_ULONG)ll & BN_MASK2;
497 ll >>= BN_BITS2;
498 a += 4;
499 b += 4;
500 r += 4;
501 n -= 4;
502 }
503 while (n) {
504 ll += (BN_ULLONG)a[0] + b[0];
505 r[0] = (BN_ULONG)ll & BN_MASK2;
506 ll >>= BN_BITS2;
507 a++;
508 b++;
509 r++;
510 n--;
511 }
512 return (BN_ULONG)ll;
513}
514
515#else /* !BN_LLONG */
516
517BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
518 int n) {
519 BN_ULONG c, l, t;
520
521 assert(n >= 0);
522 if (n <= 0) {
523 return (BN_ULONG)0;
524 }
525
526 c = 0;
527 while (n & ~3) {
528 t = a[0];
529 t = (t + c) & BN_MASK2;
530 c = (t < c);
531 l = (t + b[0]) & BN_MASK2;
532 c += (l < t);
533 r[0] = l;
534 t = a[1];
535 t = (t + c) & BN_MASK2;
536 c = (t < c);
537 l = (t + b[1]) & BN_MASK2;
538 c += (l < t);
539 r[1] = l;
540 t = a[2];
541 t = (t + c) & BN_MASK2;
542 c = (t < c);
543 l = (t + b[2]) & BN_MASK2;
544 c += (l < t);
545 r[2] = l;
546 t = a[3];
547 t = (t + c) & BN_MASK2;
548 c = (t < c);
549 l = (t + b[3]) & BN_MASK2;
550 c += (l < t);
551 r[3] = l;
552 a += 4;
553 b += 4;
554 r += 4;
555 n -= 4;
556 }
557 while (n) {
558 t = a[0];
559 t = (t + c) & BN_MASK2;
560 c = (t < c);
561 l = (t + b[0]) & BN_MASK2;
562 c += (l < t);
563 r[0] = l;
564 a++;
565 b++;
566 r++;
567 n--;
568 }
569 return (BN_ULONG)c;
570}
571
572#endif /* !BN_LLONG */
573
574BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
575 int n) {
576 BN_ULONG t1, t2;
577 int c = 0;
578
579 assert(n >= 0);
580 if (n <= 0) {
581 return (BN_ULONG)0;
582 }
583
584 while (n & ~3) {
585 t1 = a[0];
586 t2 = b[0];
587 r[0] = (t1 - t2 - c) & BN_MASK2;
588 if (t1 != t2)
589 c = (t1 < t2);
590 t1 = a[1];
591 t2 = b[1];
592 r[1] = (t1 - t2 - c) & BN_MASK2;
593 if (t1 != t2)
594 c = (t1 < t2);
595 t1 = a[2];
596 t2 = b[2];
597 r[2] = (t1 - t2 - c) & BN_MASK2;
598 if (t1 != t2)
599 c = (t1 < t2);
600 t1 = a[3];
601 t2 = b[3];
602 r[3] = (t1 - t2 - c) & BN_MASK2;
603 if (t1 != t2)
604 c = (t1 < t2);
605 a += 4;
606 b += 4;
607 r += 4;
608 n -= 4;
609 }
610 while (n) {
611 t1 = a[0];
612 t2 = b[0];
613 r[0] = (t1 - t2 - c) & BN_MASK2;
614 if (t1 != t2)
615 c = (t1 < t2);
616 a++;
617 b++;
618 r++;
619 n--;
620 }
621 return c;
622}
623
624/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
625/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
626/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
627/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
628
#ifdef BN_LLONG

/* Keep in mind that additions to multiplication result can not overflow,
 * because its high half cannot be all-ones. */
/* mul_add_c(a, b, c0, c1, c2): add the double-width product a*b into the
 * triple-word accumulator (c2:c1:c0). */
#define mul_add_c(a, b, c0, c1, c2)       \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): add 2*a*b into (c2:c1:c0), performed as
 * two separate accumulations of the product t. */
#define mul_add_c2(a, b, c0, c1, c2)      \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    BN_ULLONG tt = t + c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(tt);                \
    hi = (BN_ULONG)Hw(tt);                \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): add a[i]^2 into (c2:c1:c0). */
#define sqr_add_c(a, i, c0, c1, c2)       \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): add 2*a[i]*a[j] into (c2:c1:c0). */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
676
#elif defined(BN_UMULT_LOHI)

/* Keep in mind that additions to hi can not overflow, because the high word of
 * a multiplication result cannot be all-ones. */
/* mul_add_c(a, b, c0, c1, c2): add the double-width product a*b into the
 * triple-word accumulator (c2:c1:c0). */
#define mul_add_c(a, b, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a), tb = (b);    \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, tb);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): add 2*a*b into (c2:c1:c0), performed as
 * two separate accumulations of the product. */
#define mul_add_c2(a, b, c0, c1, c2)  \
  do {                                \
    BN_ULONG ta = (a), tb = (b);      \
    BN_ULONG lo, hi, tt;              \
    BN_UMULT_LOHI(lo, hi, ta, tb);    \
    c0 += lo;                         \
    tt = hi + ((c0 < lo) ? 1 : 0);    \
    c1 += tt;                         \
    c2 += (c1 < tt) ? 1 : 0;          \
    c0 += lo;                         \
    hi += (c0 < lo) ? 1 : 0;          \
    c1 += hi;                         \
    c2 += (c1 < hi) ? 1 : 0;          \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): add a[i]^2 into (c2:c1:c0). */
#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a)[i];           \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, ta);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): add 2*a[i]*a[j] into (c2:c1:c0). */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
719
#else /* !BN_LLONG */

/* Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones. */

/* mul_add_c(a, b, c0, c1, c2): add a*b into the triple-word accumulator
 * (c2:c1:c0), using mul64 to form the double-width product. */
#define mul_add_c(a, b, c0, c1, c2)          \
  do {                                       \
    BN_ULONG lo = LBITS(a), hi = HBITS(a);   \
    BN_ULONG bl = LBITS(b), bh = HBITS(b);   \
    mul64(lo, hi, bl, bh);                   \
    c0 = (c0 + lo) & BN_MASK2;               \
    if (c0 < lo)                             \
      hi++;                                  \
    c1 = (c1 + hi) & BN_MASK2;               \
    if (c1 < hi)                             \
      c2++;                                  \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): add 2*a*b into (c2:c1:c0), performed as
 * two separate accumulations of the product. */
#define mul_add_c2(a, b, c0, c1, c2)         \
  do {                                       \
    BN_ULONG tt;                             \
    BN_ULONG lo = LBITS(a), hi = HBITS(a);   \
    BN_ULONG bl = LBITS(b), bh = HBITS(b);   \
    mul64(lo, hi, bl, bh);                   \
    tt = hi;                                 \
    c0 = (c0 + lo) & BN_MASK2;               \
    if (c0 < lo)                             \
      tt++;                                  \
    c1 = (c1 + tt) & BN_MASK2;               \
    if (c1 < tt)                             \
      c2++;                                  \
    c0 = (c0 + lo) & BN_MASK2;               \
    if (c0 < lo)                             \
      hi++;                                  \
    c1 = (c1 + hi) & BN_MASK2;               \
    if (c1 < hi)                             \
      c2++;                                  \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): add a[i]^2 into (c2:c1:c0). */
#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG lo, hi;                \
    sqr64(lo, hi, (a)[i]);          \
    c0 = (c0 + lo) & BN_MASK2;      \
    if (c0 < lo)                    \
      hi++;                         \
    c1 = (c1 + hi) & BN_MASK2;      \
    if (c1 < hi)                    \
      c2++;                         \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): add 2*a[i]*a[j] into (c2:c1:c0). */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif /* !BN_LLONG */
773
/* bn_mul_comba8 computes the 16-word product r = a * b for 8-word inputs
 * using column-wise (comba) multiplication: each output word r[k] collects
 * every partial product a[i]*b[j] with i + j == k, carried through the
 * rotating triple-word accumulator (c1, c2, c3).  The exact call order
 * within a column is significant for the carry chain -- do not reorder. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* columns 14 and 15 */
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
875
/* bn_mul_comba4 computes the 8-word product r = a * b for 4-word inputs,
 * column by column (comba style); see bn_mul_comba8 for the scheme. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* columns 6 and 7 */
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
913
/* bn_sqr_comba8 computes the 16-word square r = a^2 for an 8-word input,
 * column by column.  Diagonal terms a[i]^2 use sqr_add_c; the symmetric
 * off-diagonal pairs a[i]*a[j] (i > j) use sqr_add_c2, which adds the
 * product twice.  Call order within a column matters for the carry chain. */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* columns 14 and 15 */
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
987
/* bn_sqr_comba4 computes the 8-word square r = a^2 for a 4-word input,
 * column by column; see bn_sqr_comba8 for the scheme. */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* columns 6 and 7 */
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
1019
#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
/* This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere. */
/* bn_mul_mont: word-serial Montgomery multiplication.  Computes
 * rp = ap * bp * R^-1 mod np with R = 2^(num * BN_BITS2).  n0p is
 * presumably the precomputed -np^-1 mod 2^BN_BITS2 factor from the
 * Montgomery context (NOTE(review): confirm against internal.h).
 * Always returns 1. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
  BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
  BN_ULONG mh; /* high half of the current multiplier word (half-word build) */
#endif
  /* Volatile alias of tp: the zeroing stores through vp cannot be optimized
   * away, so the scratch buffer is scrubbed before returning. */
  volatile BN_ULONG *vp;
  int i = 0, j;

#if 0 /* template for platform-specific implementation */
  if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
  /* num product words plus two carry words of scratch. */
  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

  n0 = *n0p;

  /* First pass: tp = ap * bp[0]. */
  c0 = 0;
  ml = bp[0];
#ifdef mul64
  mh = HBITS(ml);
  ml = LBITS(ml);
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, mh, c0);
#else
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, c0);
#endif

  tp[num] = c0;
  tp[num + 1] = 0;
  /* tp already holds the i == 0 product, so skip straight to reduction. */
  goto enter;

  for (i = 0; i < num; i++) {
    /* Accumulate: tp += ap * bp[i]. */
    c0 = 0;
    ml = bp[i];
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, mh, c0);
#else
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, c0);
#endif
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num] = c1;
    tp[num + 1] = (c1 < c0 ? 1 : 0);
  enter:
    /* Montgomery reduction step: pick ml = tp[0] * n0 so that adding ml*np
     * clears the bottom word of tp, then shift tp down by one word. */
    c1 = tp[0];
    ml = (c1 * n0) & BN_MASK2;
    c0 = 0;
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    mul_add(c1, np[0], ml, mh, c0);
#else
    mul_add(c1, ml, np[0], c0);
#endif
    for (j = 1; j < num; j++) {
      c1 = tp[j];
#ifdef mul64
      mul_add(c1, np[j], ml, mh, c0);
#else
      mul_add(c1, ml, np[j], c0);
#endif
      tp[j - 1] = c1 & BN_MASK2;
    }
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num - 1] = c1;
    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  }

  /* Conditional final subtraction: the top-word test over-approximates
   * tp >= np; the borrow from bn_sub_words resolves the ambiguous case. */
  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
    c0 = bn_sub_words(rp, tp, np, num);
    if (tp[num] != 0 || c0 == 0) {
      for (i = 0; i < num + 2; i++)
        vp[i] = 0; /* scrub scratch */
      return 1;
    }
  }
  for (i = 0; i < num; i++)
    rp[i] = tp[i], vp[i] = 0; /* copy result out, scrubbing as we go */
  vp[num] = 0;
  vp[num + 1] = 0;
  return 1;
}
1118#endif
1119
1120#endif