blob: 12b9f5af2e9db7f6e799f084495cb2828c8cab89 [file] [log] [blame]
Robert Sloana51059f2018-11-12 13:38:50 -08001# Copyright (c) 2018, Amazon Inc.
2#
3# Permission to use, copy, modify, and/or distribute this software for any
4# purpose with or without fee is hereby granted, provided that the above
5# copyright notice and this permission notice appear in all copies.
6#
7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14#
15# Written by Nir Drucker, and Shay Gueron
16# AWS Cryptographic Algorithms Group
17# (ndrucker@amazon.com, gueron@amazon.com)
18# based on BN_mod_inverse_odd
19
20$flavour = shift;
21$output = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
32*STDOUT=*OUT;
33
34#############################################################################
35# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
36# BN_ULONG a[P256_LIMBS],
37# BN_ULONG n[P256_LIMBS]);
38#
39# (Binary Extended Euclidean Algorithm.
40# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
41#
42# Assumption 1: n is odd for the BEEU
43# Assumption 2: 1 < a < n < 2^256
44
45$out = "%rdi";
46$a = "%rsi";
47$n = "%rdx";
48
49# X/Y will hold the inverse parameter
50# Assumption: X,Y<2^(256)
51$x0 = "%r8";
52$x1 = "%r9";
53$x2 = "%r10";
54$x3 = "%r11";
55# borrow from out (out is needed only at the end)
56$x4 = "%rdi";
57$y0 = "%r12";
58$y1 = "%r13";
59$y2 = "%r14";
60$y3 = "%r15";
61$y4 = "%rbp";
62$shift = "%rcx";
63$t0 = "%rax";
64$t1 = "%rbx";
65$t2 = "%rsi";
66# borrow
67$t3 = "%rcx";
68
69$T0 = "%xmm0";
70$T1 = "%xmm1";
71
72# Offsets on the stack
73$out_rsp = 0;
74$shift_rsp = $out_rsp+0x8;
75$a_rsp0 = $shift_rsp+0x8;
76$a_rsp1 = $a_rsp0+0x8;
77$a_rsp2 = $a_rsp1+0x8;
78$a_rsp3 = $a_rsp2+0x8;
79$b_rsp0 = $a_rsp3+0x8;
80$b_rsp1 = $b_rsp0+0x8;
81$b_rsp2 = $b_rsp1+0x8;
82$b_rsp3 = $b_rsp2+0x8;
83
84# Borrow when a_rsp/b_rsp are no longer needed.
85$y_rsp0 = $a_rsp0;
86$y_rsp1 = $y_rsp0+0x8;
87$y_rsp2 = $y_rsp1+0x8;
88$y_rsp3 = $y_rsp2+0x8;
89$y_rsp4 = $y_rsp3+0x8;
90$last_rsp_offset = $b_rsp3+0x8;
91
92sub TEST_B_ZERO {
93 return <<___;
94 xorq $t1, $t1
95 or $b_rsp0(%rsp), $t1
96 or $b_rsp1(%rsp), $t1
97 or $b_rsp2(%rsp), $t1
98 or $b_rsp3(%rsp), $t1
99 jz .Lbeeu_loop_end
100___
101}
102
103$g_next_label = 0;
104
105sub SHIFT1 {
106 my ($var0, $var1, $var2, $var3, $var4) = @_;
107 my $label = ".Lshift1_${g_next_label}";
108 $g_next_label++;
109
110 return <<___;
111 # Ensure X is even and divide by two.
112 movq \$1, $t1
113 andq $var0, $t1
114 jz $label
115 add 0*8($n), $var0
116 adc 1*8($n), $var1
117 adc 2*8($n), $var2
118 adc 3*8($n), $var3
119 adc \$0, $var4
120
121$label:
122 shrdq \$1, $var1, $var0
123 shrdq \$1, $var2, $var1
124 shrdq \$1, $var3, $var2
125 shrdq \$1, $var4, $var3
126 shrq \$1, $var4
127___
128}
129
130sub SHIFT256 {
131 my ($var) = @_;
132 return <<___;
133 # Copy shifted values.
134 # Remember not to override t3=rcx
135 movq 1*8+$var(%rsp), $t0
136 movq 2*8+$var(%rsp), $t1
137 movq 3*8+$var(%rsp), $t2
138
139 shrdq %cl, $t0, 0*8+$var(%rsp)
140 shrdq %cl, $t1, 1*8+$var(%rsp)
141 shrdq %cl, $t2, 2*8+$var(%rsp)
142
143 shrq %cl, $t2
144 mov $t2, 3*8+$var(%rsp)
145___
146}
147
148$code.=<<___;
149.text
150
151.type beeu_mod_inverse_vartime,\@function
152.hidden beeu_mod_inverse_vartime
153.globl beeu_mod_inverse_vartime
154.align 32
155beeu_mod_inverse_vartime:
156.cfi_startproc
157 push %rbp
158.cfi_push rbp
159 movq %rsp, %rbp
160.cfi_def_cfa_register rbp
161
162 push %r12
163.cfi_push r12
164 push %r13
165.cfi_push r13
166 push %r14
167.cfi_push r14
168 push %r15
169.cfi_push r15
170 push %rbx
171.cfi_push rbx
172 push %rsi
173.cfi_push rsi
174
175 sub \$$last_rsp_offset, %rsp
176 movq $out, $out_rsp(%rsp)
177
178 # X=1, Y=0
179 movq \$1, $x0
180 xorq $x1, $x1
181 xorq $x2, $x2
182 xorq $x3, $x3
183 xorq $x4, $x4
184
185 xorq $y0, $y0
186 xorq $y1, $y1
187 xorq $y2, $y2
188 xorq $y3, $y3
189 xorq $y4, $y4
190
191 # Copy a/n into B/A on the stack.
192 vmovdqu 0*8($a), $T0
193 vmovdqu 2*8($a), $T1
194 vmovdqu $T0, $b_rsp0(%rsp)
195 vmovdqu $T1, $b_rsp2(%rsp)
196
197 vmovdqu 0*8($n), $T0
198 vmovdqu 2*8($n), $T1
199 vmovdqu $T0, $a_rsp0(%rsp)
200 vmovdqu $T1, $a_rsp2(%rsp)
201
202.Lbeeu_loop:
203 ${\TEST_B_ZERO}
204
205 # 0 < B < |n|,
206 # 0 < A <= |n|,
207 # (1) X*a == B (mod |n|),
208 # (2) (-1)*Y*a == A (mod |n|)
209
210 # Now divide B by the maximum possible power of two in the
211 # integers, and divide X by the same value mod |n|. When we're
212 # done, (1) still holds.
213 movq \$1, $shift
214
215 # Note that B > 0
216.Lbeeu_shift_loop_XB:
217 movq $shift, $t1
218 andq $b_rsp0(%rsp), $t1
219 jnz .Lbeeu_shift_loop_end_XB
220
221 ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
222 shl \$1, $shift
223
224 # Test wraparound of the shift parameter. The probability to have 32 zeroes
225 # in a row is small Therefore having the value below equal \$0x8000000 or
226 # \$0x8000 does not affect the performance. We choose 0x8000000 because it
227 # is the maximal immediate value possible.
228 cmp \$0x8000000, $shift
229 jne .Lbeeu_shift_loop_XB
230
231.Lbeeu_shift_loop_end_XB:
232 bsf $shift, $shift
233 test $shift, $shift
234 jz .Lbeeu_no_shift_XB
235
236 ${\SHIFT256($b_rsp0)}
237
238.Lbeeu_no_shift_XB:
239 # Same for A and Y. Afterwards, (2) still holds.
240 movq \$1, $shift
241
242 # Note that A > 0
243.Lbeeu_shift_loop_YA:
244 movq $shift, $t1
245 andq $a_rsp0(%rsp), $t1
246 jnz .Lbeeu_shift_loop_end_YA
247
248 ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
249 shl \$1, $shift
250
251 # Test wraparound of the shift parameter. The probability to have 32 zeroes
252 # in a row is small therefore having the value below equal \$0x8000000 or
253 # \$0x8000 Does not affect the performance. We choose 0x8000000 because it
254 # is the maximal immediate value possible.
255 cmp \$0x8000000, $shift
256 jne .Lbeeu_shift_loop_YA
257
258.Lbeeu_shift_loop_end_YA:
259 bsf $shift, $shift
260 test $shift, $shift
261 jz .Lbeeu_no_shift_YA
262
263 ${\SHIFT256($a_rsp0)}
264
265.Lbeeu_no_shift_YA:
266 # T = B-A (A,B < 2^256)
267 mov $b_rsp0(%rsp), $t0
268 mov $b_rsp1(%rsp), $t1
269 mov $b_rsp2(%rsp), $t2
270 mov $b_rsp3(%rsp), $t3
271 sub $a_rsp0(%rsp), $t0
272 sbb $a_rsp1(%rsp), $t1
273 sbb $a_rsp2(%rsp), $t2
274 sbb $a_rsp3(%rsp), $t3 # borrow from shift
275 jnc .Lbeeu_B_bigger_than_A
276
277 # A = A - B
278 mov $a_rsp0(%rsp), $t0
279 mov $a_rsp1(%rsp), $t1
280 mov $a_rsp2(%rsp), $t2
281 mov $a_rsp3(%rsp), $t3
282 sub $b_rsp0(%rsp), $t0
283 sbb $b_rsp1(%rsp), $t1
284 sbb $b_rsp2(%rsp), $t2
285 sbb $b_rsp3(%rsp), $t3
286 mov $t0, $a_rsp0(%rsp)
287 mov $t1, $a_rsp1(%rsp)
288 mov $t2, $a_rsp2(%rsp)
289 mov $t3, $a_rsp3(%rsp)
290
291 # Y = Y + X
292 add $x0, $y0
293 adc $x1, $y1
294 adc $x2, $y2
295 adc $x3, $y3
296 adc $x4, $y4
297 jmp .Lbeeu_loop
298
299.Lbeeu_B_bigger_than_A:
300 # B = T = B - A
301 mov $t0, $b_rsp0(%rsp)
302 mov $t1, $b_rsp1(%rsp)
303 mov $t2, $b_rsp2(%rsp)
304 mov $t3, $b_rsp3(%rsp)
305
306 # X = Y + X
307 add $y0, $x0
308 adc $y1, $x1
309 adc $y2, $x2
310 adc $y3, $x3
311 adc $y4, $x4
312
313 jmp .Lbeeu_loop
314
315.Lbeeu_loop_end:
316 # The Euclid's algorithm loop ends when A == beeu(a,n);
317 # Therefore (-1)*Y*a == A (mod |n|), Y>0
318
319 # Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
320 mov $a_rsp0(%rsp), $t1
321 sub \$1, $t1
322 or $a_rsp1(%rsp), $t1
323 or $a_rsp2(%rsp), $t1
324 or $a_rsp3(%rsp), $t1
325 # If not, fail.
326 jnz .Lbeeu_err
327
328 # From this point on, we no longer need X
329 # Therefore we use it as a temporary storage.
330 # X = n
331 movq 0*8($n), $x0
332 movq 1*8($n), $x1
333 movq 2*8($n), $x2
334 movq 3*8($n), $x3
335 xorq $x4, $x4
336
337.Lbeeu_reduction_loop:
338 movq $y0, $y_rsp0(%rsp)
339 movq $y1, $y_rsp1(%rsp)
340 movq $y2, $y_rsp2(%rsp)
341 movq $y3, $y_rsp3(%rsp)
342 movq $y4, $y_rsp4(%rsp)
343
344 # If Y>n ==> Y=Y-n
345 sub $x0, $y0
346 sbb $x1, $y1
347 sbb $x2, $y2
348 sbb $x3, $y3
349 sbb \$0, $y4
350
351 # Choose old Y or new Y
352 cmovc $y_rsp0(%rsp), $y0
353 cmovc $y_rsp1(%rsp), $y1
354 cmovc $y_rsp2(%rsp), $y2
355 cmovc $y_rsp3(%rsp), $y3
356 jnc .Lbeeu_reduction_loop
357
358 # X = n - Y (n, Y < 2^256), (Cancel the (-1))
359 sub $y0, $x0
360 sbb $y1, $x1
361 sbb $y2, $x2
362 sbb $y3, $x3
363
364.Lbeeu_save:
365 # Save the inverse(<2^256) to out.
366 mov $out_rsp(%rsp), $out
367
368 movq $x0, 0*8($out)
369 movq $x1, 1*8($out)
370 movq $x2, 2*8($out)
371 movq $x3, 3*8($out)
372
373 # Return 1.
374 movq \$1, %rax
375 jmp .Lbeeu_finish
376
377.Lbeeu_err:
378 # Return 0.
379 xorq %rax, %rax
380
381.Lbeeu_finish:
382 add \$$last_rsp_offset, %rsp
383 pop %rsi
384.cfi_pop rsi
385 pop %rbx
386.cfi_pop rbx
387 pop %r15
388.cfi_pop r15
389 pop %r14
390.cfi_pop r14
391 pop %r13
392.cfi_pop r13
393 pop %r12
394.cfi_pop r12
395 pop %rbp
396.cfi_pop rbp
397.cfi_def_cfa rsp, 8
398.cfi_endproc
399 ret
400
401.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
402___
403
404print $code;
405close STDOUT;