Blame - src/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl - platform/external/boringssl

blob: 0bb65477ac60094e7b604d5fb36f08058d89a308 [file] [log] [blame]

Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	1	# Copyright (c) 2018, Amazon Inc.
				2	#
				3	# Permission to use, copy, modify, and/or distribute this software for any
				4	# purpose with or without fee is hereby granted, provided that the above
				5	# copyright notice and this permission notice appear in all copies.
				6	#
				7	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
				8	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
				9	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
				10	# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				11	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
				12	# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
				13	# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
				14	#
				15	# Written by Nir Drucker, and Shay Gueron
				16	# AWS Cryptographic Algorithms Group
				17	# (ndrucker@amazon.com, gueron@amazon.com)
				18	# based on BN_mod_inverse_odd
				19
				20	$flavour = shift;
				21	$output = shift;
				22	if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
				23
				24	$win64=0; $win64=1 if ($flavour =~ /[nm]asm\|mingw64/ \|\| $output =~ /\.asm$/);
				25
				26	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
				27	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
				28	( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
				29	die "can't locate x86_64-xlate.pl";
				30
				31	open OUT,"\| \"$^X\" \"$xlate\" $flavour \"$output\"";
				32	STDOUT=OUT;
				33
				34	#############################################################################
				35	# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
				36	# BN_ULONG a[P256_LIMBS],
				37	# BN_ULONG n[P256_LIMBS]);
				38	#
				39	# (Binary Extended Euclidean Algorithm.
				40	# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
				41	#
				42	# Assumption 1: n is odd for the BEEU
				43	# Assumption 2: 1 < a < n < 2^256
				44
				45	$out = "%rdi";
				46	$a = "%rsi";
				47	$n = "%rdx";
				48
				49	# X/Y will hold the inverse parameter
				50	# Assumption: X,Y<2^(256)
				51	$x0 = "%r8";
				52	$x1 = "%r9";
				53	$x2 = "%r10";
				54	$x3 = "%r11";
				55	# borrow from out (out is needed only at the end)
				56	$x4 = "%rdi";
				57	$y0 = "%r12";
				58	$y1 = "%r13";
				59	$y2 = "%r14";
				60	$y3 = "%r15";
				61	$y4 = "%rbp";
				62	$shift = "%rcx";
				63	$t0 = "%rax";
				64	$t1 = "%rbx";
				65	$t2 = "%rsi";
				66	# borrow
				67	$t3 = "%rcx";
				68
				69	$T0 = "%xmm0";
				70	$T1 = "%xmm1";
				71
				72	# Offsets on the stack
				73	$out_rsp = 0;
				74	$shift_rsp = $out_rsp+0x8;
				75	$a_rsp0 = $shift_rsp+0x8;
				76	$a_rsp1 = $a_rsp0+0x8;
				77	$a_rsp2 = $a_rsp1+0x8;
				78	$a_rsp3 = $a_rsp2+0x8;
				79	$b_rsp0 = $a_rsp3+0x8;
				80	$b_rsp1 = $b_rsp0+0x8;
				81	$b_rsp2 = $b_rsp1+0x8;
				82	$b_rsp3 = $b_rsp2+0x8;
				83
				84	# Borrow when a_rsp/b_rsp are no longer needed.
				85	$y_rsp0 = $a_rsp0;
				86	$y_rsp1 = $y_rsp0+0x8;
				87	$y_rsp2 = $y_rsp1+0x8;
				88	$y_rsp3 = $y_rsp2+0x8;
				89	$y_rsp4 = $y_rsp3+0x8;
				90	$last_rsp_offset = $b_rsp3+0x8;
				91
				92	sub TEST_B_ZERO {
				93	return <<___;
				94	xorq $t1, $t1
				95	or $b_rsp0(%rsp), $t1
				96	or $b_rsp1(%rsp), $t1
				97	or $b_rsp2(%rsp), $t1
				98	or $b_rsp3(%rsp), $t1
				99	jz .Lbeeu_loop_end
				100	___
				101	}
				102
				103	$g_next_label = 0;
				104
				105	sub SHIFT1 {
				106	my ($var0, $var1, $var2, $var3, $var4) = @_;
				107	my $label = ".Lshift1_${g_next_label}";
				108	$g_next_label++;
				109
				110	return <<___;
				111	# Ensure X is even and divide by two.
				112	movq \$1, $t1
				113	andq $var0, $t1
				114	jz $label
				115	add 0*8($n), $var0
				116	adc 1*8($n), $var1
				117	adc 2*8($n), $var2
				118	adc 3*8($n), $var3
				119	adc \$0, $var4
				120
				121	$label:
				122	shrdq \$1, $var1, $var0
				123	shrdq \$1, $var2, $var1
				124	shrdq \$1, $var3, $var2
				125	shrdq \$1, $var4, $var3
				126	shrq \$1, $var4
				127	___
				128	}
				129
				130	sub SHIFT256 {
				131	my ($var) = @_;
				132	return <<___;
				133	# Copy shifted values.
				134	# Remember not to override t3=rcx
				135	movq 1*8+$var(%rsp), $t0
				136	movq 2*8+$var(%rsp), $t1
				137	movq 3*8+$var(%rsp), $t2
				138
				139	shrdq %cl, $t0, 0*8+$var(%rsp)
				140	shrdq %cl, $t1, 1*8+$var(%rsp)
				141	shrdq %cl, $t2, 2*8+$var(%rsp)
				142
				143	shrq %cl, $t2
				144	mov $t2, 3*8+$var(%rsp)
				145	___
				146	}
				147
				148	$code.=<<___;
				149	.text
				150
				151	.type beeu_mod_inverse_vartime,\@function
				152	.hidden beeu_mod_inverse_vartime
				153	.globl beeu_mod_inverse_vartime
				154	.align 32
				155	beeu_mod_inverse_vartime:
				156	.cfi_startproc
				157	push %rbp
				158	.cfi_push rbp
Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	159	push %r12
				160	.cfi_push r12
				161	push %r13
				162	.cfi_push r13
				163	push %r14
				164	.cfi_push r14
				165	push %r15
				166	.cfi_push r15
				167	push %rbx
				168	.cfi_push rbx
				169	push %rsi
				170	.cfi_push rsi
				171
				172	sub \$$last_rsp_offset, %rsp
Robert Sloan	4c22c5f	2019-03-01 15:53:37 -0800	[diff] [blame]	173	.cfi_adjust_cfa_offset $last_rsp_offset
Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	174	movq $out, $out_rsp(%rsp)
				175
				176	# X=1, Y=0
				177	movq \$1, $x0
				178	xorq $x1, $x1
				179	xorq $x2, $x2
				180	xorq $x3, $x3
				181	xorq $x4, $x4
				182
				183	xorq $y0, $y0
				184	xorq $y1, $y1
				185	xorq $y2, $y2
				186	xorq $y3, $y3
				187	xorq $y4, $y4
				188
				189	# Copy a/n into B/A on the stack.
				190	vmovdqu 0*8($a), $T0
				191	vmovdqu 2*8($a), $T1
				192	vmovdqu $T0, $b_rsp0(%rsp)
				193	vmovdqu $T1, $b_rsp2(%rsp)
				194
				195	vmovdqu 0*8($n), $T0
				196	vmovdqu 2*8($n), $T1
				197	vmovdqu $T0, $a_rsp0(%rsp)
				198	vmovdqu $T1, $a_rsp2(%rsp)
				199
				200	.Lbeeu_loop:
				201	${\TEST_B_ZERO}
				202
				203	# 0 < B < \|n\|,
				204	# 0 < A <= \|n\|,
				205	# (1) X*a == B (mod \|n\|),
				206	# (2) (-1)Ya == A (mod \|n\|)
				207
				208	# Now divide B by the maximum possible power of two in the
				209	# integers, and divide X by the same value mod \|n\|. When we're
				210	# done, (1) still holds.
				211	movq \$1, $shift
				212
				213	# Note that B > 0
				214	.Lbeeu_shift_loop_XB:
				215	movq $shift, $t1
				216	andq $b_rsp0(%rsp), $t1
				217	jnz .Lbeeu_shift_loop_end_XB
				218
				219	${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
				220	shl \$1, $shift
				221
				222	# Test wraparound of the shift parameter. The probability to have 32 zeroes
				223	# in a row is small Therefore having the value below equal \$0x8000000 or
				224	# \$0x8000 does not affect the performance. We choose 0x8000000 because it
				225	# is the maximal immediate value possible.
				226	cmp \$0x8000000, $shift
				227	jne .Lbeeu_shift_loop_XB
				228
				229	.Lbeeu_shift_loop_end_XB:
				230	bsf $shift, $shift
				231	test $shift, $shift
				232	jz .Lbeeu_no_shift_XB
				233
				234	${\SHIFT256($b_rsp0)}
				235
				236	.Lbeeu_no_shift_XB:
				237	# Same for A and Y. Afterwards, (2) still holds.
				238	movq \$1, $shift
				239
				240	# Note that A > 0
				241	.Lbeeu_shift_loop_YA:
				242	movq $shift, $t1
				243	andq $a_rsp0(%rsp), $t1
				244	jnz .Lbeeu_shift_loop_end_YA
				245
				246	${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
				247	shl \$1, $shift
				248
				249	# Test wraparound of the shift parameter. The probability to have 32 zeroes
				250	# in a row is small therefore having the value below equal \$0x8000000 or
				251	# \$0x8000 Does not affect the performance. We choose 0x8000000 because it
				252	# is the maximal immediate value possible.
				253	cmp \$0x8000000, $shift
				254	jne .Lbeeu_shift_loop_YA
				255
				256	.Lbeeu_shift_loop_end_YA:
				257	bsf $shift, $shift
				258	test $shift, $shift
				259	jz .Lbeeu_no_shift_YA
				260
				261	${\SHIFT256($a_rsp0)}
				262
				263	.Lbeeu_no_shift_YA:
				264	# T = B-A (A,B < 2^256)
				265	mov $b_rsp0(%rsp), $t0
				266	mov $b_rsp1(%rsp), $t1
				267	mov $b_rsp2(%rsp), $t2
				268	mov $b_rsp3(%rsp), $t3
				269	sub $a_rsp0(%rsp), $t0
				270	sbb $a_rsp1(%rsp), $t1
				271	sbb $a_rsp2(%rsp), $t2
				272	sbb $a_rsp3(%rsp), $t3 # borrow from shift
				273	jnc .Lbeeu_B_bigger_than_A
				274
				275	# A = A - B
				276	mov $a_rsp0(%rsp), $t0
				277	mov $a_rsp1(%rsp), $t1
				278	mov $a_rsp2(%rsp), $t2
				279	mov $a_rsp3(%rsp), $t3
				280	sub $b_rsp0(%rsp), $t0
				281	sbb $b_rsp1(%rsp), $t1
				282	sbb $b_rsp2(%rsp), $t2
				283	sbb $b_rsp3(%rsp), $t3
				284	mov $t0, $a_rsp0(%rsp)
				285	mov $t1, $a_rsp1(%rsp)
				286	mov $t2, $a_rsp2(%rsp)
				287	mov $t3, $a_rsp3(%rsp)
				288
				289	# Y = Y + X
				290	add $x0, $y0
				291	adc $x1, $y1
				292	adc $x2, $y2
				293	adc $x3, $y3
				294	adc $x4, $y4
				295	jmp .Lbeeu_loop
				296
				297	.Lbeeu_B_bigger_than_A:
				298	# B = T = B - A
				299	mov $t0, $b_rsp0(%rsp)
				300	mov $t1, $b_rsp1(%rsp)
				301	mov $t2, $b_rsp2(%rsp)
				302	mov $t3, $b_rsp3(%rsp)
				303
				304	# X = Y + X
				305	add $y0, $x0
				306	adc $y1, $x1
				307	adc $y2, $x2
				308	adc $y3, $x3
				309	adc $y4, $x4
				310
				311	jmp .Lbeeu_loop
				312
				313	.Lbeeu_loop_end:
				314	# The Euclid's algorithm loop ends when A == beeu(a,n);
				315	# Therefore (-1)Ya == A (mod \|n\|), Y>0
				316
				317	# Verify that A = 1 ==> (-1)Ya = A = 1 (mod \|n\|)
				318	mov $a_rsp0(%rsp), $t1
				319	sub \$1, $t1
				320	or $a_rsp1(%rsp), $t1
				321	or $a_rsp2(%rsp), $t1
				322	or $a_rsp3(%rsp), $t1
				323	# If not, fail.
				324	jnz .Lbeeu_err
				325
				326	# From this point on, we no longer need X
				327	# Therefore we use it as a temporary storage.
				328	# X = n
				329	movq 0*8($n), $x0
				330	movq 1*8($n), $x1
				331	movq 2*8($n), $x2
				332	movq 3*8($n), $x3
				333	xorq $x4, $x4
				334
				335	.Lbeeu_reduction_loop:
				336	movq $y0, $y_rsp0(%rsp)
				337	movq $y1, $y_rsp1(%rsp)
				338	movq $y2, $y_rsp2(%rsp)
				339	movq $y3, $y_rsp3(%rsp)
				340	movq $y4, $y_rsp4(%rsp)
				341
				342	# If Y>n ==> Y=Y-n
				343	sub $x0, $y0
				344	sbb $x1, $y1
				345	sbb $x2, $y2
				346	sbb $x3, $y3
				347	sbb \$0, $y4
				348
				349	# Choose old Y or new Y
				350	cmovc $y_rsp0(%rsp), $y0
				351	cmovc $y_rsp1(%rsp), $y1
				352	cmovc $y_rsp2(%rsp), $y2
				353	cmovc $y_rsp3(%rsp), $y3
				354	jnc .Lbeeu_reduction_loop
				355
				356	# X = n - Y (n, Y < 2^256), (Cancel the (-1))
				357	sub $y0, $x0
				358	sbb $y1, $x1
				359	sbb $y2, $x2
				360	sbb $y3, $x3
				361
				362	.Lbeeu_save:
				363	# Save the inverse(<2^256) to out.
				364	mov $out_rsp(%rsp), $out
				365
				366	movq $x0, 0*8($out)
				367	movq $x1, 1*8($out)
				368	movq $x2, 2*8($out)
				369	movq $x3, 3*8($out)
				370
				371	# Return 1.
				372	movq \$1, %rax
				373	jmp .Lbeeu_finish
				374
				375	.Lbeeu_err:
				376	# Return 0.
				377	xorq %rax, %rax
				378
				379	.Lbeeu_finish:
				380	add \$$last_rsp_offset, %rsp
Robert Sloan	4c22c5f	2019-03-01 15:53:37 -0800	[diff] [blame]	381	.cfi_adjust_cfa_offset -$last_rsp_offset
Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	382	pop %rsi
				383	.cfi_pop rsi
				384	pop %rbx
				385	.cfi_pop rbx
				386	pop %r15
				387	.cfi_pop r15
				388	pop %r14
				389	.cfi_pop r14
				390	pop %r13
				391	.cfi_pop r13
				392	pop %r12
				393	.cfi_pop r12
				394	pop %rbp
				395	.cfi_pop rbp
Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	396	ret
Robert Sloan	4c22c5f	2019-03-01 15:53:37 -0800	[diff] [blame]	397	.cfi_endproc
Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	398
				399	.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
				400	___
				401
				402	print $code;
Srinivas Paladugu	dd42a61	2019-08-09 19:30:39 +0000	[diff] [blame]	403	close STDOUT;