/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2 * All rights reserved.
3 *
4 * This package is an SSL implementation written
5 * by Eric Young (eay@cryptsoft.com).
6 * The implementation was written so as to conform with Netscapes SSL.
7 *
8 * This library is free for commercial and non-commercial use as long as
9 * the following conditions are aheared to. The following conditions
10 * apply to all code found in this distribution, be it the RC4, RSA,
11 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
12 * included with this distribution is covered by the same copyright terms
13 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14 *
15 * Copyright remains Eric Young's, and as such any Copyright notices in
16 * the code are not to be removed.
17 * If this package is used in a product, Eric Young should be given attribution
18 * as the author of the parts of the library used.
19 * This can be in the form of a textual message at program startup or
20 * in documentation (online or textual) provided with the package.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the copyright
26 * notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 * notice, this list of conditions and the following disclaimer in the
29 * documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 * must display the following acknowledgement:
32 * "This product includes cryptographic software written by
33 * Eric Young (eay@cryptsoft.com)"
34 * The word 'cryptographic' can be left out if the rouines from the library
35 * being used are not cryptographic related :-).
36 * 4. If you include any Windows specific code (or a derivative thereof) from
37 * the apps directory (application code) you must include an acknowledgement:
38 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39 *
40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * The licence and distribution terms for any publically available version or
53 * derivative of this code cannot be changed. i.e. this code cannot simply be
54 * copied and put under another distribution licence
55 * [including the GNU Public Licence.] */
56
57#include <openssl/bn.h>
58
59#include <assert.h>
60
61#include "internal.h"
62
63
/* Generic implementations of most operations are needed for:
 * - Configurations without inline assembly.
 * - Architectures other than x86 or x86_64.
 * - Windows x86_64; x86_64-gcc.c does not build on MSVC. */
#if defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
    (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))

/* MSVC spells alloca as _alloca and has no <alloca.h>; other platforms
 * declare alloca in <alloca.h>. bn_mul_mont below uses it for scratch. */
#if defined(OPENSSL_WINDOWS)
#define alloca _alloca
#else
#include <alloca.h>
#endif
77
#ifdef BN_LLONG
/* mul_add(r, a, w, c): r = low word of (w * a + r + c); c = high word.
 * Uses the double-width BN_ULLONG type; Lw/Hw extract low/high words. */
#define mul_add(r, a, w, c)             \
  {                                     \
    BN_ULLONG t;                        \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t);                        \
    (c) = Hw(t);                        \
  }

/* mul(r, a, w, c): r = low word of (w * a + c); c = high word. */
#define mul(r, a, w, c)           \
  {                               \
    BN_ULLONG t;                  \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t);                  \
    (c) = Hw(t);                  \
  }

/* sqr(r0, r1, a): (r1:r0) = a * a; r0 is the low word, r1 the high word. */
#define sqr(r0, r1, a)        \
  {                           \
    BN_ULLONG t;              \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t);             \
    (r1) = Hw(t);             \
  }
102
#elif defined(BN_UMULT_LOHI)
/* mul_add(r, a, w, c): r = low word of (w * a + r + c); c = high word.
 * BN_UMULT_LOHI(lo, hi, x, y) yields the double-width product x*y. */
#define mul_add(r, a, w, c)               \
  {                                       \
    BN_ULONG high, low, ret, tmp = (a);   \
    ret = (r);                            \
    BN_UMULT_LOHI(low, high, w, tmp);     \
    ret += (c);                           \
    (c) = (ret < (c)) ? 1 : 0;            \
    (c) += high;                          \
    ret += low;                           \
    (c) += (ret < low) ? 1 : 0;           \
    (r) = ret;                            \
  }

/* mul(r, a, w, c): r = low word of (w * a + c); c = high word. */
#define mul(r, a, w, c)                \
  {                                    \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta);   \
    ret = low + (c);                   \
    (c) = high;                        \
    (c) += (ret < low) ? 1 : 0;        \
    (r) = ret;                         \
  }

/* sqr(r0, r1, a): (r1:r0) = a * a. */
#define sqr(r0, r1, a)                \
  {                                   \
    BN_ULONG tmp = (a);               \
    BN_UMULT_LOHI(r0, r1, tmp, tmp);  \
  }
132
#else

/*************************************************************
 * No long long type
 */

/* Half-word helpers: BN_BITS4 is half the word width and BN_MASK2l masks
 * the low half of a word. */
#define LBITS(a) ((a) & BN_MASK2l)                /* low half of a word */
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)  /* high half of a word */
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2) /* low half -> high slot */

/* NOTE(review): LL2HBITS references BN_ULLONG inside the no-long-long
 * branch, and these three macros appear unused in this file -- confirm
 * before relying on them. */
#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
146
/* mul64(l, h, bl, bh): word-by-word multiply built from half-word products.
 * On entry l/h hold the low/high halves of multiplicand a and bl/bh the
 * halves of multiplier b; on exit (h:l) is the double-width product a*b. */
#define mul64(l, h, bl, bh)       \
  {                               \
    BN_ULONG m, m1, lt, ht;       \
                                  \
    lt = l;                       \
    ht = h;                       \
    m = (bh) * (lt);              \
    lt = (bl) * (lt);             \
    m1 = (bl) * (ht);             \
    ht = (bh) * (ht);             \
    m = (m + m1) & BN_MASK2; /* sum of the two cross products */ \
    if (m < m1)                   \
      ht += L2HBITS((BN_ULONG)1); /* cross-product sum wrapped */ \
    ht += HBITS(m);               \
    m1 = L2HBITS(m);              \
    lt = (lt + m1) & BN_MASK2;    \
    if (lt < m1)                  \
      ht++;                       \
    (l) = lt;                     \
    (h) = ht;                     \
  }
168
/* sqr64(lo, ho, in): (ho:lo) = in * in via half-word products.  The single
 * cross product m appears twice in the square, hence the extra one-bit
 * shift when folding it in. */
#define sqr64(lo, ho, in)                          \
  {                                                \
    BN_ULONG l, h, m;                              \
                                                   \
    h = (in);                                      \
    l = LBITS(h);                                  \
    h = HBITS(h);                                  \
    m = (l) * (h);                                 \
    l *= l;                                        \
    h *= h;                                        \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1);       \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1);         \
    l = (l + m) & BN_MASK2;                        \
    if (l < m)                                     \
      h++;                                         \
    (lo) = l;                                      \
    (ho) = h;                                      \
  }
187
/* mul_add(r, a, bl, bh, c): r = low word of (a*b + r + c), c = high word,
 * where bl/bh are the pre-split halves of the multiplier b. */
#define mul_add(r, a, bl, bh, c) \
  {                              \
    BN_ULONG l, h;               \
                                 \
    h = (a);                     \
    l = LBITS(h);                \
    h = HBITS(h);                \
    mul64(l, h, (bl), (bh));     \
                                 \
    /* non-multiply part */      \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = (r);                   \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = h & BN_MASK2;          \
    (r) = l;                     \
  }
208
/* mul(r, a, bl, bh, c): r = low word of (a*b + c), c = high word, where
 * bl/bh are the pre-split halves of the multiplier b. */
#define mul(r, a, bl, bh, c)    \
  {                             \
    BN_ULONG l, h;              \
                                \
    h = (a);                    \
    l = LBITS(h);               \
    h = HBITS(h);               \
    mul64(l, h, (bl), (bh));    \
                                \
    /* non-multiply part */     \
    l += (c);                   \
    if ((l & BN_MASK2) < (c))   \
      h++;                      \
    (c) = h & BN_MASK2;         \
    (r) = l & BN_MASK2;         \
  }
#endif /* !BN_LLONG */
226
227#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
228
229BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
230 BN_ULONG w) {
231 BN_ULONG c1 = 0;
232
233 assert(num >= 0);
234 if (num <= 0) {
235 return c1;
236 }
237
238 while (num & ~3) {
239 mul_add(rp[0], ap[0], w, c1);
240 mul_add(rp[1], ap[1], w, c1);
241 mul_add(rp[2], ap[2], w, c1);
242 mul_add(rp[3], ap[3], w, c1);
243 ap += 4;
244 rp += 4;
245 num -= 4;
246 }
247
248 while (num) {
249 mul_add(rp[0], ap[0], w, c1);
250 ap++;
251 rp++;
252 num--;
253 }
254
255 return c1;
256}
257
258BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
259 BN_ULONG c1 = 0;
260
261 assert(num >= 0);
262 if (num <= 0) {
263 return c1;
264 }
265
266 while (num & ~3) {
267 mul(rp[0], ap[0], w, c1);
268 mul(rp[1], ap[1], w, c1);
269 mul(rp[2], ap[2], w, c1);
270 mul(rp[3], ap[3], w, c1);
271 ap += 4;
272 rp += 4;
273 num -= 4;
274 }
275 while (num) {
276 mul(rp[0], ap[0], w, c1);
277 ap++;
278 rp++;
279 num--;
280 }
281 return c1;
282}
283
284void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
285 assert(n >= 0);
286 if (n <= 0) {
287 return;
288 }
289
290 while (n & ~3) {
291 sqr(r[0], r[1], a[0]);
292 sqr(r[2], r[3], a[1]);
293 sqr(r[4], r[5], a[2]);
294 sqr(r[6], r[7], a[3]);
295 a += 4;
296 r += 8;
297 n -= 4;
298 }
299 while (n) {
300 sqr(r[0], r[1], a[0]);
301 a++;
302 r += 2;
303 n--;
304 }
305}
306
307#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
308
309BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
310 BN_ULONG w) {
311 BN_ULONG c = 0;
312 BN_ULONG bl, bh;
313
314 assert(num >= 0);
315 if (num <= 0) {
316 return (BN_ULONG)0;
317 }
318
319 bl = LBITS(w);
320 bh = HBITS(w);
321
322 while (num & ~3) {
323 mul_add(rp[0], ap[0], bl, bh, c);
324 mul_add(rp[1], ap[1], bl, bh, c);
325 mul_add(rp[2], ap[2], bl, bh, c);
326 mul_add(rp[3], ap[3], bl, bh, c);
327 ap += 4;
328 rp += 4;
329 num -= 4;
330 }
331 while (num) {
332 mul_add(rp[0], ap[0], bl, bh, c);
333 ap++;
334 rp++;
335 num--;
336 }
337 return c;
338}
339
340BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
341 BN_ULONG carry = 0;
342 BN_ULONG bl, bh;
343
344 assert(num >= 0);
345 if (num <= 0) {
346 return (BN_ULONG)0;
347 }
348
349 bl = LBITS(w);
350 bh = HBITS(w);
351
352 while (num & ~3) {
353 mul(rp[0], ap[0], bl, bh, carry);
354 mul(rp[1], ap[1], bl, bh, carry);
355 mul(rp[2], ap[2], bl, bh, carry);
356 mul(rp[3], ap[3], bl, bh, carry);
357 ap += 4;
358 rp += 4;
359 num -= 4;
360 }
361 while (num) {
362 mul(rp[0], ap[0], bl, bh, carry);
363 ap++;
364 rp++;
365 num--;
366 }
367 return carry;
368}
369
370void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
371 assert(n >= 0);
372 if (n <= 0) {
373 return;
374 }
375
376 while (n & ~3) {
377 sqr64(r[0], r[1], a[0]);
378 sqr64(r[2], r[3], a[1]);
379 sqr64(r[4], r[5], a[2]);
380 sqr64(r[6], r[7], a[3]);
381 a += 4;
382 r += 8;
383 n -= 4;
384 }
385 while (n) {
386 sqr64(r[0], r[1], a[0]);
387 a++;
388 r += 2;
389 n--;
390 }
391}
392
393#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
394
395#if defined(BN_LLONG)
396
397BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
398 return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
399}
400
401#else
402
/* Divide h,l by d and return the result. */
/* bn_div_words computes floor((h:l) / d) without a double-width integer
 * type: the divisor is normalized so its top bit is set, then two half-word
 * quotient digits are produced by estimate-and-correct schoolbook division. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

  if (d == 0) {
    /* Division by zero: saturate to all-ones instead of trapping. */
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d; /* keep the quotient from overflowing a single word */
  }

  if (i) {
    /* Normalize: shift divisor and dividend left until d's top bit is set. */
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  dh = (d & BN_MASK2h) >> BN_BITS4; /* high half of divisor */
  dl = (d & BN_MASK2l);             /* low half of divisor */
  for (;;) {
    /* Estimate the next half-word quotient digit from the top halves. */
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;
    /* Correct the over-estimate downward. */
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    /* Subtract q*d from the running remainder (h:l). */
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d; /* digit was still one too large; borrow back */
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    /* Record the high digit and shift the next half-word into position. */
    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}
472
473#endif /* !defined(BN_LLONG) */
474
475#ifdef BN_LLONG
476BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
477 int n) {
478 BN_ULLONG ll = 0;
479
480 assert(n >= 0);
481 if (n <= 0) {
482 return (BN_ULONG)0;
483 }
484
485 while (n & ~3) {
486 ll += (BN_ULLONG)a[0] + b[0];
487 r[0] = (BN_ULONG)ll & BN_MASK2;
488 ll >>= BN_BITS2;
489 ll += (BN_ULLONG)a[1] + b[1];
490 r[1] = (BN_ULONG)ll & BN_MASK2;
491 ll >>= BN_BITS2;
492 ll += (BN_ULLONG)a[2] + b[2];
493 r[2] = (BN_ULONG)ll & BN_MASK2;
494 ll >>= BN_BITS2;
495 ll += (BN_ULLONG)a[3] + b[3];
496 r[3] = (BN_ULONG)ll & BN_MASK2;
497 ll >>= BN_BITS2;
498 a += 4;
499 b += 4;
500 r += 4;
501 n -= 4;
502 }
503 while (n) {
504 ll += (BN_ULLONG)a[0] + b[0];
505 r[0] = (BN_ULONG)ll & BN_MASK2;
506 ll >>= BN_BITS2;
507 a++;
508 b++;
509 r++;
510 n--;
511 }
512 return (BN_ULONG)ll;
513}
514
515#else /* !BN_LLONG */
516
517BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
518 int n) {
519 BN_ULONG c, l, t;
520
521 assert(n >= 0);
522 if (n <= 0) {
523 return (BN_ULONG)0;
524 }
525
526 c = 0;
527 while (n & ~3) {
528 t = a[0];
529 t = (t + c) & BN_MASK2;
530 c = (t < c);
531 l = (t + b[0]) & BN_MASK2;
532 c += (l < t);
533 r[0] = l;
534 t = a[1];
535 t = (t + c) & BN_MASK2;
536 c = (t < c);
537 l = (t + b[1]) & BN_MASK2;
538 c += (l < t);
539 r[1] = l;
540 t = a[2];
541 t = (t + c) & BN_MASK2;
542 c = (t < c);
543 l = (t + b[2]) & BN_MASK2;
544 c += (l < t);
545 r[2] = l;
546 t = a[3];
547 t = (t + c) & BN_MASK2;
548 c = (t < c);
549 l = (t + b[3]) & BN_MASK2;
550 c += (l < t);
551 r[3] = l;
552 a += 4;
553 b += 4;
554 r += 4;
555 n -= 4;
556 }
557 while (n) {
558 t = a[0];
559 t = (t + c) & BN_MASK2;
560 c = (t < c);
561 l = (t + b[0]) & BN_MASK2;
562 c += (l < t);
563 r[0] = l;
564 a++;
565 b++;
566 r++;
567 n--;
568 }
569 return (BN_ULONG)c;
570}
571
572#endif /* !BN_LLONG */
573
574BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
575 int n) {
576 BN_ULONG t1, t2;
577 int c = 0;
578
579 assert(n >= 0);
580 if (n <= 0) {
581 return (BN_ULONG)0;
582 }
583
584 while (n & ~3) {
585 t1 = a[0];
586 t2 = b[0];
587 r[0] = (t1 - t2 - c) & BN_MASK2;
588 if (t1 != t2)
589 c = (t1 < t2);
590 t1 = a[1];
591 t2 = b[1];
592 r[1] = (t1 - t2 - c) & BN_MASK2;
593 if (t1 != t2)
594 c = (t1 < t2);
595 t1 = a[2];
596 t2 = b[2];
597 r[2] = (t1 - t2 - c) & BN_MASK2;
598 if (t1 != t2)
599 c = (t1 < t2);
600 t1 = a[3];
601 t2 = b[3];
602 r[3] = (t1 - t2 - c) & BN_MASK2;
603 if (t1 != t2)
604 c = (t1 < t2);
605 a += 4;
606 b += 4;
607 r += 4;
608 n -= 4;
609 }
610 while (n) {
611 t1 = a[0];
612 t2 = b[0];
613 r[0] = (t1 - t2 - c) & BN_MASK2;
614 if (t1 != t2)
615 c = (t1 < t2);
616 a++;
617 b++;
618 r++;
619 n--;
620 }
621 return c;
622}
623
624/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
625/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
626/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
627/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
628
#ifdef BN_LLONG

/* Keep in mind that additions to multiplication result can not overflow,
 * because its high half cannot be all-ones. */
/* mul_add_c(a, b, c0, c1, c2): add the double-width product a*b into the
 * triple-word accumulator (c2:c1:c0). */
#define mul_add_c(a, b, c0, c1, c2)       \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): add 2*a*b into (c2:c1:c0), performed as
 * two separate accumulations of the product t. */
#define mul_add_c2(a, b, c0, c1, c2)      \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    BN_ULLONG tt = t + c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(tt);                \
    hi = (BN_ULONG)Hw(tt);                \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): add a[i]^2 into (c2:c1:c0). */
#define sqr_add_c(a, i, c0, c1, c2)       \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): add 2*a[i]*a[j] into (c2:c1:c0). */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
676
#elif defined(BN_UMULT_LOHI)

/* Keep in mind that additions to hi can not overflow, because the high word of
 * a multiplication result cannot be all-ones. */
/* mul_add_c(a, b, c0, c1, c2): add the double-width product a*b into the
 * triple-word accumulator (c2:c1:c0). */
#define mul_add_c(a, b, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a), tb = (b);    \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, tb);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): add 2*a*b into (c2:c1:c0), performed as
 * two separate accumulations of the product. */
#define mul_add_c2(a, b, c0, c1, c2)  \
  do {                                \
    BN_ULONG ta = (a), tb = (b);      \
    BN_ULONG lo, hi, tt;              \
    BN_UMULT_LOHI(lo, hi, ta, tb);    \
    c0 += lo;                         \
    tt = hi + ((c0 < lo) ? 1 : 0);    \
    c1 += tt;                         \
    c2 += (c1 < tt) ? 1 : 0;          \
    c0 += lo;                         \
    hi += (c0 < lo) ? 1 : 0;          \
    c1 += hi;                         \
    c2 += (c1 < hi) ? 1 : 0;          \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): add a[i]^2 into (c2:c1:c0). */
#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a)[i];           \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, ta);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): add 2*a[i]*a[j] into (c2:c1:c0). */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
719
#else /* !BN_LLONG */

/* Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones. */

/* mul_add_c(a, b, c0, c1, c2): add a*b into the triple-word accumulator
 * (c2:c1:c0), using mul64 to form the double-width product. */
#define mul_add_c(a, b, c0, c1, c2)          \
  do {                                       \
    BN_ULONG lo = LBITS(a), hi = HBITS(a);   \
    BN_ULONG bl = LBITS(b), bh = HBITS(b);   \
    mul64(lo, hi, bl, bh);                   \
    c0 = (c0 + lo) & BN_MASK2;               \
    if (c0 < lo)                             \
      hi++;                                  \
    c1 = (c1 + hi) & BN_MASK2;               \
    if (c1 < hi)                             \
      c2++;                                  \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): add 2*a*b into (c2:c1:c0), performed as
 * two separate accumulations of the product. */
#define mul_add_c2(a, b, c0, c1, c2)         \
  do {                                       \
    BN_ULONG tt;                             \
    BN_ULONG lo = LBITS(a), hi = HBITS(a);   \
    BN_ULONG bl = LBITS(b), bh = HBITS(b);   \
    mul64(lo, hi, bl, bh);                   \
    tt = hi;                                 \
    c0 = (c0 + lo) & BN_MASK2;               \
    if (c0 < lo)                             \
      tt++;                                  \
    c1 = (c1 + tt) & BN_MASK2;               \
    if (c1 < tt)                             \
      c2++;                                  \
    c0 = (c0 + lo) & BN_MASK2;               \
    if (c0 < lo)                             \
      hi++;                                  \
    c1 = (c1 + hi) & BN_MASK2;               \
    if (c1 < hi)                             \
      c2++;                                  \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): add a[i]^2 into (c2:c1:c0). */
#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG lo, hi;                \
    sqr64(lo, hi, (a)[i]);          \
    c0 = (c0 + lo) & BN_MASK2;      \
    if (c0 < lo)                    \
      hi++;                         \
    c1 = (c1 + hi) & BN_MASK2;      \
    if (c1 < hi)                    \
      c2++;                         \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): add 2*a[i]*a[j] into (c2:c1:c0). */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif /* !BN_LLONG */
773
/* bn_mul_comba8 computes the 16-word product r = a * b for 8-word inputs
 * using column-wise (comba) multiplication: each output word r[k] collects
 * every partial product a[i]*b[j] with i + j == k, carried through the
 * rotating triple-word accumulator (c1, c2, c3).  The exact call order
 * within a column is significant for the carry chain -- do not reorder. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* columns 14 and 15 */
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
875
/* bn_mul_comba4 computes the 8-word product r = a * b for 4-word inputs,
 * column by column (comba style); see bn_mul_comba8 for the scheme. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* columns 6 and 7 */
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
913
/* bn_sqr_comba8 computes the 16-word square r = a^2 for an 8-word input,
 * column by column.  Diagonal terms a[i]^2 use sqr_add_c; the symmetric
 * off-diagonal pairs a[i]*a[j] (i > j) use sqr_add_c2, which adds the
 * product twice.  Call order within a column matters for the carry chain. */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* columns 14 and 15 */
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
987
/* bn_sqr_comba4 computes the 8-word square r = a^2 for a 4-word input,
 * column by column; see bn_sqr_comba8 for the scheme. */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* columns 6 and 7 */
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
1019
#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
/* This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere. */
/* bn_mul_mont: word-serial Montgomery multiplication.  Computes
 * rp = ap * bp * R^-1 mod np with R = 2^(num * BN_BITS2).  n0p is
 * presumably the precomputed -np^-1 mod 2^BN_BITS2 factor from the
 * Montgomery context (NOTE(review): confirm against internal.h).
 * Always returns 1. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
  BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
  BN_ULONG mh; /* high half of the current multiplier word (half-word build) */
#endif
  /* Volatile alias of tp: the zeroing stores through vp cannot be optimized
   * away, so the scratch buffer is scrubbed before returning. */
  volatile BN_ULONG *vp;
  int i = 0, j;

#if 0 /* template for platform-specific implementation */
  if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
  /* num product words plus two carry words of scratch. */
  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

  n0 = *n0p;

  /* First pass: tp = ap * bp[0]. */
  c0 = 0;
  ml = bp[0];
#ifdef mul64
  mh = HBITS(ml);
  ml = LBITS(ml);
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, mh, c0);
#else
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, c0);
#endif

  tp[num] = c0;
  tp[num + 1] = 0;
  /* tp already holds the i == 0 product, so skip straight to reduction. */
  goto enter;

  for (i = 0; i < num; i++) {
    /* Accumulate: tp += ap * bp[i]. */
    c0 = 0;
    ml = bp[i];
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, mh, c0);
#else
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, c0);
#endif
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num] = c1;
    tp[num + 1] = (c1 < c0 ? 1 : 0);
  enter:
    /* Montgomery reduction step: pick ml = tp[0] * n0 so that adding ml*np
     * clears the bottom word of tp, then shift tp down by one word. */
    c1 = tp[0];
    ml = (c1 * n0) & BN_MASK2;
    c0 = 0;
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    mul_add(c1, np[0], ml, mh, c0);
#else
    mul_add(c1, ml, np[0], c0);
#endif
    for (j = 1; j < num; j++) {
      c1 = tp[j];
#ifdef mul64
      mul_add(c1, np[j], ml, mh, c0);
#else
      mul_add(c1, ml, np[j], c0);
#endif
      tp[j - 1] = c1 & BN_MASK2;
    }
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num - 1] = c1;
    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  }

  /* Conditional final subtraction: the top-word test over-approximates
   * tp >= np; the borrow from bn_sub_words resolves the ambiguous case. */
  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
    c0 = bn_sub_words(rp, tp, np, num);
    if (tp[num] != 0 || c0 == 0) {
      for (i = 0; i < num + 2; i++)
        vp[i] = 0; /* scrub scratch */
      return 1;
    }
  }
  for (i = 0; i < num; i++)
    rp[i] = tp[i], vp[i] = 0; /* copy result out, scrubbing as we go */
  vp[num] = 0;
  vp[num + 1] = 0;
  return 1;
}
1118#endif
1119
1120#endif