Blame - src/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl - platform/external/boringssl

blob: 12b9f5af2e9db7f6e799f084495cb2828c8cab89 [file] [log] [blame]

Robert Sloan	a51059f	2018-11-12 13:38:50 -0800	[diff] [blame]	1	# Copyright (c) 2018, Amazon Inc.
				2	#
				3	# Permission to use, copy, modify, and/or distribute this software for any
				4	# purpose with or without fee is hereby granted, provided that the above
				5	# copyright notice and this permission notice appear in all copies.
				6	#
				7	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
				8	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
				9	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
				10	# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				11	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
				12	# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
				13	# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
				14	#
				15	# Written by Nir Drucker, and Shay Gueron
				16	# AWS Cryptographic Algorithms Group
				17	# (ndrucker@amazon.com, gueron@amazon.com)
				18	# based on BN_mod_inverse_odd
				19
				20	$flavour = shift;
				21	$output = shift;
				22	if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
				23
				24	$win64=0; $win64=1 if ($flavour =~ /[nm]asm\|mingw64/ \|\| $output =~ /\.asm$/);
				25
				26	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
				27	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
				28	( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
				29	die "can't locate x86_64-xlate.pl";
				30
				31	open OUT,"\| \"$^X\" \"$xlate\" $flavour \"$output\"";
				32	STDOUT=OUT;
				33
				34	#############################################################################
				35	# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
				36	# BN_ULONG a[P256_LIMBS],
				37	# BN_ULONG n[P256_LIMBS]);
				38	#
				39	# (Binary Extended Euclidean Algorithm.
				40	# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
				41	#
				42	# Assumption 1: n is odd for the BEEU
				43	# Assumption 2: 1 < a < n < 2^256
				44
				45	$out = "%rdi";
				46	$a = "%rsi";
				47	$n = "%rdx";
				48
				49	# X/Y will hold the inverse parameter
				50	# Assumption: X,Y<2^(256)
				51	$x0 = "%r8";
				52	$x1 = "%r9";
				53	$x2 = "%r10";
				54	$x3 = "%r11";
				55	# borrow from out (out is needed only at the end)
				56	$x4 = "%rdi";
				57	$y0 = "%r12";
				58	$y1 = "%r13";
				59	$y2 = "%r14";
				60	$y3 = "%r15";
				61	$y4 = "%rbp";
				62	$shift = "%rcx";
				63	$t0 = "%rax";
				64	$t1 = "%rbx";
				65	$t2 = "%rsi";
				66	# borrow
				67	$t3 = "%rcx";
				68
				69	$T0 = "%xmm0";
				70	$T1 = "%xmm1";
				71
				72	# Offsets on the stack
				73	$out_rsp = 0;
				74	$shift_rsp = $out_rsp+0x8;
				75	$a_rsp0 = $shift_rsp+0x8;
				76	$a_rsp1 = $a_rsp0+0x8;
				77	$a_rsp2 = $a_rsp1+0x8;
				78	$a_rsp3 = $a_rsp2+0x8;
				79	$b_rsp0 = $a_rsp3+0x8;
				80	$b_rsp1 = $b_rsp0+0x8;
				81	$b_rsp2 = $b_rsp1+0x8;
				82	$b_rsp3 = $b_rsp2+0x8;
				83
				84	# Borrow when a_rsp/b_rsp are no longer needed.
				85	$y_rsp0 = $a_rsp0;
				86	$y_rsp1 = $y_rsp0+0x8;
				87	$y_rsp2 = $y_rsp1+0x8;
				88	$y_rsp3 = $y_rsp2+0x8;
				89	$y_rsp4 = $y_rsp3+0x8;
				90	$last_rsp_offset = $b_rsp3+0x8;
				91
				92	sub TEST_B_ZERO {
				93	return <<___;
				94	xorq $t1, $t1
				95	or $b_rsp0(%rsp), $t1
				96	or $b_rsp1(%rsp), $t1
				97	or $b_rsp2(%rsp), $t1
				98	or $b_rsp3(%rsp), $t1
				99	jz .Lbeeu_loop_end
				100	___
				101	}
				102
				103	$g_next_label = 0;
				104
				105	sub SHIFT1 {
				106	my ($var0, $var1, $var2, $var3, $var4) = @_;
				107	my $label = ".Lshift1_${g_next_label}";
				108	$g_next_label++;
				109
				110	return <<___;
				111	# Ensure X is even and divide by two.
				112	movq \$1, $t1
				113	andq $var0, $t1
				114	jz $label
				115	add 0*8($n), $var0
				116	adc 1*8($n), $var1
				117	adc 2*8($n), $var2
				118	adc 3*8($n), $var3
				119	adc \$0, $var4
				120
				121	$label:
				122	shrdq \$1, $var1, $var0
				123	shrdq \$1, $var2, $var1
				124	shrdq \$1, $var3, $var2
				125	shrdq \$1, $var4, $var3
				126	shrq \$1, $var4
				127	___
				128	}
				129
				130	sub SHIFT256 {
				131	my ($var) = @_;
				132	return <<___;
				133	# Copy shifted values.
				134	# Remember not to override t3=rcx
				135	movq 1*8+$var(%rsp), $t0
				136	movq 2*8+$var(%rsp), $t1
				137	movq 3*8+$var(%rsp), $t2
				138
				139	shrdq %cl, $t0, 0*8+$var(%rsp)
				140	shrdq %cl, $t1, 1*8+$var(%rsp)
				141	shrdq %cl, $t2, 2*8+$var(%rsp)
				142
				143	shrq %cl, $t2
				144	mov $t2, 3*8+$var(%rsp)
				145	___
				146	}
				147
				148	$code.=<<___;
				149	.text
				150
				151	.type beeu_mod_inverse_vartime,\@function
				152	.hidden beeu_mod_inverse_vartime
				153	.globl beeu_mod_inverse_vartime
				154	.align 32
				155	beeu_mod_inverse_vartime:
				156	.cfi_startproc
				157	push %rbp
				158	.cfi_push rbp
				159	movq %rsp, %rbp
				160	.cfi_def_cfa_register rbp
				161
				162	push %r12
				163	.cfi_push r12
				164	push %r13
				165	.cfi_push r13
				166	push %r14
				167	.cfi_push r14
				168	push %r15
				169	.cfi_push r15
				170	push %rbx
				171	.cfi_push rbx
				172	push %rsi
				173	.cfi_push rsi
				174
				175	sub \$$last_rsp_offset, %rsp
				176	movq $out, $out_rsp(%rsp)
				177
				178	# X=1, Y=0
				179	movq \$1, $x0
				180	xorq $x1, $x1
				181	xorq $x2, $x2
				182	xorq $x3, $x3
				183	xorq $x4, $x4
				184
				185	xorq $y0, $y0
				186	xorq $y1, $y1
				187	xorq $y2, $y2
				188	xorq $y3, $y3
				189	xorq $y4, $y4
				190
				191	# Copy a/n into B/A on the stack.
				192	vmovdqu 0*8($a), $T0
				193	vmovdqu 2*8($a), $T1
				194	vmovdqu $T0, $b_rsp0(%rsp)
				195	vmovdqu $T1, $b_rsp2(%rsp)
				196
				197	vmovdqu 0*8($n), $T0
				198	vmovdqu 2*8($n), $T1
				199	vmovdqu $T0, $a_rsp0(%rsp)
				200	vmovdqu $T1, $a_rsp2(%rsp)
				201
				202	.Lbeeu_loop:
				203	${\TEST_B_ZERO}
				204
				205	# 0 < B < \|n\|,
				206	# 0 < A <= \|n\|,
				207	# (1) X*a == B (mod \|n\|),
				208	# (2) (-1)Ya == A (mod \|n\|)
				209
				210	# Now divide B by the maximum possible power of two in the
				211	# integers, and divide X by the same value mod \|n\|. When we're
				212	# done, (1) still holds.
				213	movq \$1, $shift
				214
				215	# Note that B > 0
				216	.Lbeeu_shift_loop_XB:
				217	movq $shift, $t1
				218	andq $b_rsp0(%rsp), $t1
				219	jnz .Lbeeu_shift_loop_end_XB
				220
				221	${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
				222	shl \$1, $shift
				223
				224	# Test wraparound of the shift parameter. The probability to have 32 zeroes
				225	# in a row is small Therefore having the value below equal \$0x8000000 or
				226	# \$0x8000 does not affect the performance. We choose 0x8000000 because it
				227	# is the maximal immediate value possible.
				228	cmp \$0x8000000, $shift
				229	jne .Lbeeu_shift_loop_XB
				230
				231	.Lbeeu_shift_loop_end_XB:
				232	bsf $shift, $shift
				233	test $shift, $shift
				234	jz .Lbeeu_no_shift_XB
				235
				236	${\SHIFT256($b_rsp0)}
				237
				238	.Lbeeu_no_shift_XB:
				239	# Same for A and Y. Afterwards, (2) still holds.
				240	movq \$1, $shift
				241
				242	# Note that A > 0
				243	.Lbeeu_shift_loop_YA:
				244	movq $shift, $t1
				245	andq $a_rsp0(%rsp), $t1
				246	jnz .Lbeeu_shift_loop_end_YA
				247
				248	${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
				249	shl \$1, $shift
				250
				251	# Test wraparound of the shift parameter. The probability to have 32 zeroes
				252	# in a row is small therefore having the value below equal \$0x8000000 or
				253	# \$0x8000 Does not affect the performance. We choose 0x8000000 because it
				254	# is the maximal immediate value possible.
				255	cmp \$0x8000000, $shift
				256	jne .Lbeeu_shift_loop_YA
				257
				258	.Lbeeu_shift_loop_end_YA:
				259	bsf $shift, $shift
				260	test $shift, $shift
				261	jz .Lbeeu_no_shift_YA
				262
				263	${\SHIFT256($a_rsp0)}
				264
				265	.Lbeeu_no_shift_YA:
				266	# T = B-A (A,B < 2^256)
				267	mov $b_rsp0(%rsp), $t0
				268	mov $b_rsp1(%rsp), $t1
				269	mov $b_rsp2(%rsp), $t2
				270	mov $b_rsp3(%rsp), $t3
				271	sub $a_rsp0(%rsp), $t0
				272	sbb $a_rsp1(%rsp), $t1
				273	sbb $a_rsp2(%rsp), $t2
				274	sbb $a_rsp3(%rsp), $t3 # borrow from shift
				275	jnc .Lbeeu_B_bigger_than_A
				276
				277	# A = A - B
				278	mov $a_rsp0(%rsp), $t0
				279	mov $a_rsp1(%rsp), $t1
				280	mov $a_rsp2(%rsp), $t2
				281	mov $a_rsp3(%rsp), $t3
				282	sub $b_rsp0(%rsp), $t0
				283	sbb $b_rsp1(%rsp), $t1
				284	sbb $b_rsp2(%rsp), $t2
				285	sbb $b_rsp3(%rsp), $t3
				286	mov $t0, $a_rsp0(%rsp)
				287	mov $t1, $a_rsp1(%rsp)
				288	mov $t2, $a_rsp2(%rsp)
				289	mov $t3, $a_rsp3(%rsp)
				290
				291	# Y = Y + X
				292	add $x0, $y0
				293	adc $x1, $y1
				294	adc $x2, $y2
				295	adc $x3, $y3
				296	adc $x4, $y4
				297	jmp .Lbeeu_loop
				298
				299	.Lbeeu_B_bigger_than_A:
				300	# B = T = B - A
				301	mov $t0, $b_rsp0(%rsp)
				302	mov $t1, $b_rsp1(%rsp)
				303	mov $t2, $b_rsp2(%rsp)
				304	mov $t3, $b_rsp3(%rsp)
				305
				306	# X = Y + X
				307	add $y0, $x0
				308	adc $y1, $x1
				309	adc $y2, $x2
				310	adc $y3, $x3
				311	adc $y4, $x4
				312
				313	jmp .Lbeeu_loop
				314
				315	.Lbeeu_loop_end:
				316	# The Euclid's algorithm loop ends when A == beeu(a,n);
				317	# Therefore (-1)Ya == A (mod \|n\|), Y>0
				318
				319	# Verify that A = 1 ==> (-1)Ya = A = 1 (mod \|n\|)
				320	mov $a_rsp0(%rsp), $t1
				321	sub \$1, $t1
				322	or $a_rsp1(%rsp), $t1
				323	or $a_rsp2(%rsp), $t1
				324	or $a_rsp3(%rsp), $t1
				325	# If not, fail.
				326	jnz .Lbeeu_err
				327
				328	# From this point on, we no longer need X
				329	# Therefore we use it as a temporary storage.
				330	# X = n
				331	movq 0*8($n), $x0
				332	movq 1*8($n), $x1
				333	movq 2*8($n), $x2
				334	movq 3*8($n), $x3
				335	xorq $x4, $x4
				336
				337	.Lbeeu_reduction_loop:
				338	movq $y0, $y_rsp0(%rsp)
				339	movq $y1, $y_rsp1(%rsp)
				340	movq $y2, $y_rsp2(%rsp)
				341	movq $y3, $y_rsp3(%rsp)
				342	movq $y4, $y_rsp4(%rsp)
				343
				344	# If Y>n ==> Y=Y-n
				345	sub $x0, $y0
				346	sbb $x1, $y1
				347	sbb $x2, $y2
				348	sbb $x3, $y3
				349	sbb \$0, $y4
				350
				351	# Choose old Y or new Y
				352	cmovc $y_rsp0(%rsp), $y0
				353	cmovc $y_rsp1(%rsp), $y1
				354	cmovc $y_rsp2(%rsp), $y2
				355	cmovc $y_rsp3(%rsp), $y3
				356	jnc .Lbeeu_reduction_loop
				357
				358	# X = n - Y (n, Y < 2^256), (Cancel the (-1))
				359	sub $y0, $x0
				360	sbb $y1, $x1
				361	sbb $y2, $x2
				362	sbb $y3, $x3
				363
				364	.Lbeeu_save:
				365	# Save the inverse(<2^256) to out.
				366	mov $out_rsp(%rsp), $out
				367
				368	movq $x0, 0*8($out)
				369	movq $x1, 1*8($out)
				370	movq $x2, 2*8($out)
				371	movq $x3, 3*8($out)
				372
				373	# Return 1.
				374	movq \$1, %rax
				375	jmp .Lbeeu_finish
				376
				377	.Lbeeu_err:
				378	# Return 0.
				379	xorq %rax, %rax
				380
				381	.Lbeeu_finish:
				382	add \$$last_rsp_offset, %rsp
				383	pop %rsi
				384	.cfi_pop rsi
				385	pop %rbx
				386	.cfi_pop rbx
				387	pop %r15
				388	.cfi_pop r15
				389	pop %r14
				390	.cfi_pop r14
				391	pop %r13
				392	.cfi_pop r13
				393	pop %r12
				394	.cfi_pop r12
				395	pop %rbp
				396	.cfi_pop rbp
				397	.cfi_def_cfa rsp, 8
				398	.cfi_endproc
				399	ret
				400
				401	.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
				402	___
				403
				404	print $code;
				405	close STDOUT;