/* x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                         sign    verify    sign/s verify/s
 *    rsa  512 bits      0.0006s   0.0001s   1683.8  18456.2
 *    rsa 1024 bits      0.0028s   0.0002s    356.0   6407.0
 *    rsa 2048 bits      0.0172s   0.0005s     58.0   1957.8
 *    rsa 4096 bits      0.1155s   0.0018s      8.7    555.6
 *                         sign    verify    sign/s verify/s
 *    dsa  512 bits      0.0005s   0.0006s   2100.8   1768.3
 *    dsa 1024 bits      0.0014s   0.0018s    692.3    559.2
 *    dsa 2048 bits      0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                         sign    verify    sign/s verify/s
 *    rsa  512 bits      0.0004s   0.0000s   2767.1  33297.9
 *    rsa 1024 bits      0.0012s   0.0001s    867.4  14674.7
 *    rsa 2048 bits      0.0061s   0.0002s    164.0   5270.0
 *    rsa 4096 bits      0.0384s   0.0006s     26.1   1650.8
 *                         sign    verify    sign/s verify/s
 *    dsa  512 bits      0.0002s   0.0003s   4442.2   3786.3
 *    dsa 1024 bits      0.0005s   0.0007s   1835.1   1497.4
 *    dsa 2048 bits      0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */

#include <openssl/bn.h>

/* TODO(davidben): Get this file working on Windows x64. */
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)

#include "../internal.h"

#undef mul
#undef mul_add

#define asm __asm__

/*
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep
 * the value of zero.
 */
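/* mul_add(r, a, word, carry) computes r += a * word + carry and leaves the
 * new carry word in |carry|; mul(r, a, word, carry) computes
 * r = low(a * word + carry) and leaves the high word in |carry|;
 * sqr(r0, r1, a) computes (r1:r0) = a * a. */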
#define mul_add(r, a, word, carry)                                        \
  do {                                                                    \
    register BN_ULONG high, low;                                          \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc");    \
    asm("addq %2,%0; adcq %3,%1"                                          \
        : "+r"(carry), "+d"(high)                                         \
        : "a"(low), "g"(0)                                                \
        : "cc");                                                          \
    asm("addq %2,%0; adcq %3,%1"                                          \
        : "+m"(r), "+d"(high)                                             \
        : "r"(carry), "g"(0)                                              \
        : "cc");                                                          \
    (carry) = high;                                                       \
  } while (0)

#define mul(r, a, word, carry)                                            \
  do {                                                                    \
    register BN_ULONG high, low;                                          \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc");    \
    asm("addq %2,%0; adcq %3,%1"                                          \
        : "+r"(carry), "+d"(high)                                         \
        : "a"(low), "g"(0)                                                \
        : "cc");                                                          \
    (r) = (carry);                                                        \
    (carry) = high;                                                       \
  } while (0)
#undef sqr
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");

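/* bn_mul_add_words computes rp[i] += ap[i] * w for 0 <= i < num and returns
 * the final carry word. */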
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[2], ap[2], w, c1);
    return c1;
  }

  return c1;
}

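/* bn_mul_words computes rp[i] = ap[i] * w for 0 <= i < num and returns the
 * final carry word. */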
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[2], ap[2], w, c1);
  }
  return c1;
}

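/* bn_sqr_words squares each word of |a| into a double-word result:
 * (r[2*i+1], r[2*i]) = a[i] * a[i] for 0 <= i < n. */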
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  if (n) {
    sqr(r[0], r[1], a[0]);
    if (--n == 0) {
      return;
    }
    sqr(r[2], r[3], a[1]);
    if (--n == 0) {
      return;
    }
    sqr(r[4], r[5], a[2]);
  }
}

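/* bn_add_words computes rp[i] = ap[i] + bp[i] for 0 <= i < n and returns the
 * final carry bit (0 or 1). */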
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  asm volatile (
      " subq %0,%0 \n" /* clear carry */
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " adcq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " loop 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

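/* bn_sub_words computes rp[i] = ap[i] - bp[i] for 0 <= i < n and returns the
 * final borrow bit (0 or 1). */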
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0) {
    return 0;
  }

  asm volatile (
      " subq %0,%0 \n" /* clear borrow */
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " sbbq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " loop 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

/* mul_add_c(a,b,c0,c1,c2)    -- c += a*b         for the three-word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c += 2*a*b       for the three-word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c += a[i]^2      for the three-word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c += 2*a[i]*a[j] for the three-word number c=(c2,c1,c0) */

/* Keep in mind that carrying into the high part of the multiplication result
 * cannot overflow, because that high word cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2)                                       \
  do {                                                                    \
    BN_ULONG t1, t2;                                                      \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");          \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)                                       \
  do {                                                                    \
    BN_ULONG t1, t2;                                                      \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc");             \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)                                      \
  do {                                                                    \
    BN_ULONG t1, t2;                                                      \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");          \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                              \
        : "+r"(c0), "+r"(c1), "+r"(c2)                                    \
        : "r"(t1), "r"(t2), "g"(0)                                        \
        : "cc");                                                          \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

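/* bn_mul_comba8 computes the full 16-word product r = a * b, where |a| and |b|
 * are 8 words each, accumulating one output column at a time. */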
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

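/* bn_mul_comba4 computes the full 8-word product r = a * b, where |a| and |b|
 * are 4 words each. */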
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

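/* bn_sqr_comba8 computes the 16-word square r = a * a, where |a| is 8 words,
 * using sqr_add_c for the diagonal terms and sqr_add_c2 for the doubled
 * off-diagonal terms. */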
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

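/* bn_sqr_comba4 computes the 8-word square r = a * a, where |a| is 4 words. */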
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef sqr_add_c
#undef mul_add_c2
#undef sqr_add_c2

#endif /* !NO_ASM && X86_64 && __GNUC__ */