Robert Sloan | a51059f | 2018-11-12 13:38:50 -0800 | [diff] [blame] | 1 | # Copyright (c) 2018, Amazon Inc. |
| 2 | # |
| 3 | # Permission to use, copy, modify, and/or distribute this software for any |
| 4 | # purpose with or without fee is hereby granted, provided that the above |
| 5 | # copyright notice and this permission notice appear in all copies. |
| 6 | # |
| 7 | # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 8 | # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 9 | # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
| 10 | # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 11 | # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION |
| 12 | # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
| 13 | # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ |
| 14 | # |
| 15 | # Written by Nir Drucker, and Shay Gueron |
| 16 | # AWS Cryptographic Algorithms Group |
| 17 | # (ndrucker@amazon.com, gueron@amazon.com) |
| 18 | # based on BN_mod_inverse_odd |
| 19 | |
| 20 | $flavour = shift; |
| 21 | $output = shift; |
| 22 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } |
| 23 | |
| 24 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); |
| 25 | |
| 26 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 27 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or |
| 28 | ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or |
| 29 | die "can't locate x86_64-xlate.pl"; |
| 30 | |
| 31 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
| 32 | *STDOUT=*OUT; |
| 33 | |
| 34 | ############################################################################# |
| 35 | # extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], |
| 36 | # BN_ULONG a[P256_LIMBS], |
| 37 | # BN_ULONG n[P256_LIMBS]); |
| 38 | # |
| 39 | # (Binary Extended Euclidean Algorithm. |
| 40 | # See https://en.wikipedia.org/wiki/Binary_GCD_algorithm) |
| 41 | # |
| 42 | # Assumption 1: n is odd for the BEEU |
| 43 | # Assumption 2: 1 < a < n < 2^256 |
| 44 | |
| 45 | $out = "%rdi"; |
| 46 | $a = "%rsi"; |
| 47 | $n = "%rdx"; |
| 48 | |
| 49 | # X/Y will hold the inverse parameter |
| 50 | # Assumption: X,Y<2^(256) |
| 51 | $x0 = "%r8"; |
| 52 | $x1 = "%r9"; |
| 53 | $x2 = "%r10"; |
| 54 | $x3 = "%r11"; |
| 55 | # borrow from out (out is needed only at the end) |
| 56 | $x4 = "%rdi"; |
| 57 | $y0 = "%r12"; |
| 58 | $y1 = "%r13"; |
| 59 | $y2 = "%r14"; |
| 60 | $y3 = "%r15"; |
| 61 | $y4 = "%rbp"; |
| 62 | $shift = "%rcx"; |
| 63 | $t0 = "%rax"; |
| 64 | $t1 = "%rbx"; |
| 65 | $t2 = "%rsi"; |
| 66 | # borrow |
| 67 | $t3 = "%rcx"; |
| 68 | |
| 69 | $T0 = "%xmm0"; |
| 70 | $T1 = "%xmm1"; |
| 71 | |
| 72 | # Offsets on the stack |
| 73 | $out_rsp = 0; |
| 74 | $shift_rsp = $out_rsp+0x8; |
| 75 | $a_rsp0 = $shift_rsp+0x8; |
| 76 | $a_rsp1 = $a_rsp0+0x8; |
| 77 | $a_rsp2 = $a_rsp1+0x8; |
| 78 | $a_rsp3 = $a_rsp2+0x8; |
| 79 | $b_rsp0 = $a_rsp3+0x8; |
| 80 | $b_rsp1 = $b_rsp0+0x8; |
| 81 | $b_rsp2 = $b_rsp1+0x8; |
| 82 | $b_rsp3 = $b_rsp2+0x8; |
| 83 | |
| 84 | # Borrow when a_rsp/b_rsp are no longer needed. |
| 85 | $y_rsp0 = $a_rsp0; |
| 86 | $y_rsp1 = $y_rsp0+0x8; |
| 87 | $y_rsp2 = $y_rsp1+0x8; |
| 88 | $y_rsp3 = $y_rsp2+0x8; |
| 89 | $y_rsp4 = $y_rsp3+0x8; |
| 90 | $last_rsp_offset = $b_rsp3+0x8; |
| 91 | |
| 92 | sub TEST_B_ZERO { |
| 93 | return <<___; |
| 94 | xorq $t1, $t1 |
| 95 | or $b_rsp0(%rsp), $t1 |
| 96 | or $b_rsp1(%rsp), $t1 |
| 97 | or $b_rsp2(%rsp), $t1 |
| 98 | or $b_rsp3(%rsp), $t1 |
| 99 | jz .Lbeeu_loop_end |
| 100 | ___ |
| 101 | } |
| 102 | |
| 103 | $g_next_label = 0; |
| 104 | |
| 105 | sub SHIFT1 { |
| 106 | my ($var0, $var1, $var2, $var3, $var4) = @_; |
| 107 | my $label = ".Lshift1_${g_next_label}"; |
| 108 | $g_next_label++; |
| 109 | |
| 110 | return <<___; |
| 111 | # Ensure X is even and divide by two. |
| 112 | movq \$1, $t1 |
| 113 | andq $var0, $t1 |
| 114 | jz $label |
| 115 | add 0*8($n), $var0 |
| 116 | adc 1*8($n), $var1 |
| 117 | adc 2*8($n), $var2 |
| 118 | adc 3*8($n), $var3 |
| 119 | adc \$0, $var4 |
| 120 | |
| 121 | $label: |
| 122 | shrdq \$1, $var1, $var0 |
| 123 | shrdq \$1, $var2, $var1 |
| 124 | shrdq \$1, $var3, $var2 |
| 125 | shrdq \$1, $var4, $var3 |
| 126 | shrq \$1, $var4 |
| 127 | ___ |
| 128 | } |
| 129 | |
| 130 | sub SHIFT256 { |
| 131 | my ($var) = @_; |
| 132 | return <<___; |
| 133 | # Copy shifted values. |
| 134 | # Remember not to override t3=rcx |
| 135 | movq 1*8+$var(%rsp), $t0 |
| 136 | movq 2*8+$var(%rsp), $t1 |
| 137 | movq 3*8+$var(%rsp), $t2 |
| 138 | |
| 139 | shrdq %cl, $t0, 0*8+$var(%rsp) |
| 140 | shrdq %cl, $t1, 1*8+$var(%rsp) |
| 141 | shrdq %cl, $t2, 2*8+$var(%rsp) |
| 142 | |
| 143 | shrq %cl, $t2 |
| 144 | mov $t2, 3*8+$var(%rsp) |
| 145 | ___ |
| 146 | } |
| 147 | |
| 148 | $code.=<<___; |
| 149 | .text |
| 150 | |
| 151 | .type beeu_mod_inverse_vartime,\@function |
| 152 | .hidden beeu_mod_inverse_vartime |
| 153 | .globl beeu_mod_inverse_vartime |
| 154 | .align 32 |
| 155 | beeu_mod_inverse_vartime: |
| 156 | .cfi_startproc |
| 157 | push %rbp |
| 158 | .cfi_push rbp |
Robert Sloan | a51059f | 2018-11-12 13:38:50 -0800 | [diff] [blame] | 159 | push %r12 |
| 160 | .cfi_push r12 |
| 161 | push %r13 |
| 162 | .cfi_push r13 |
| 163 | push %r14 |
| 164 | .cfi_push r14 |
| 165 | push %r15 |
| 166 | .cfi_push r15 |
| 167 | push %rbx |
| 168 | .cfi_push rbx |
| 169 | push %rsi |
| 170 | .cfi_push rsi |
| 171 | |
| 172 | sub \$$last_rsp_offset, %rsp |
Robert Sloan | 4c22c5f | 2019-03-01 15:53:37 -0800 | [diff] [blame] | 173 | .cfi_adjust_cfa_offset $last_rsp_offset |
Robert Sloan | a51059f | 2018-11-12 13:38:50 -0800 | [diff] [blame] | 174 | movq $out, $out_rsp(%rsp) |
| 175 | |
| 176 | # X=1, Y=0 |
| 177 | movq \$1, $x0 |
| 178 | xorq $x1, $x1 |
| 179 | xorq $x2, $x2 |
| 180 | xorq $x3, $x3 |
| 181 | xorq $x4, $x4 |
| 182 | |
| 183 | xorq $y0, $y0 |
| 184 | xorq $y1, $y1 |
| 185 | xorq $y2, $y2 |
| 186 | xorq $y3, $y3 |
| 187 | xorq $y4, $y4 |
| 188 | |
| 189 | # Copy a/n into B/A on the stack. |
| 190 | vmovdqu 0*8($a), $T0 |
| 191 | vmovdqu 2*8($a), $T1 |
| 192 | vmovdqu $T0, $b_rsp0(%rsp) |
| 193 | vmovdqu $T1, $b_rsp2(%rsp) |
| 194 | |
| 195 | vmovdqu 0*8($n), $T0 |
| 196 | vmovdqu 2*8($n), $T1 |
| 197 | vmovdqu $T0, $a_rsp0(%rsp) |
| 198 | vmovdqu $T1, $a_rsp2(%rsp) |
| 199 | |
| 200 | .Lbeeu_loop: |
| 201 | ${\TEST_B_ZERO} |
| 202 | |
| 203 | # 0 < B < |n|, |
| 204 | # 0 < A <= |n|, |
| 205 | # (1) X*a == B (mod |n|), |
| 206 | # (2) (-1)*Y*a == A (mod |n|) |
| 207 | |
| 208 | # Now divide B by the maximum possible power of two in the |
| 209 | # integers, and divide X by the same value mod |n|. When we're |
| 210 | # done, (1) still holds. |
| 211 | movq \$1, $shift |
| 212 | |
| 213 | # Note that B > 0 |
| 214 | .Lbeeu_shift_loop_XB: |
| 215 | movq $shift, $t1 |
| 216 | andq $b_rsp0(%rsp), $t1 |
| 217 | jnz .Lbeeu_shift_loop_end_XB |
| 218 | |
| 219 | ${\SHIFT1($x0, $x1, $x2, $x3, $x4)} |
| 220 | shl \$1, $shift |
| 221 | |
| 222 | # Test wraparound of the shift parameter. The probability to have 32 zeroes |
| 223 | # in a row is small Therefore having the value below equal \$0x8000000 or |
| 224 | # \$0x8000 does not affect the performance. We choose 0x8000000 because it |
| 225 | # is the maximal immediate value possible. |
| 226 | cmp \$0x8000000, $shift |
| 227 | jne .Lbeeu_shift_loop_XB |
| 228 | |
| 229 | .Lbeeu_shift_loop_end_XB: |
| 230 | bsf $shift, $shift |
| 231 | test $shift, $shift |
| 232 | jz .Lbeeu_no_shift_XB |
| 233 | |
| 234 | ${\SHIFT256($b_rsp0)} |
| 235 | |
| 236 | .Lbeeu_no_shift_XB: |
| 237 | # Same for A and Y. Afterwards, (2) still holds. |
| 238 | movq \$1, $shift |
| 239 | |
| 240 | # Note that A > 0 |
| 241 | .Lbeeu_shift_loop_YA: |
| 242 | movq $shift, $t1 |
| 243 | andq $a_rsp0(%rsp), $t1 |
| 244 | jnz .Lbeeu_shift_loop_end_YA |
| 245 | |
| 246 | ${\SHIFT1($y0, $y1, $y2, $y3, $y4)} |
| 247 | shl \$1, $shift |
| 248 | |
| 249 | # Test wraparound of the shift parameter. The probability to have 32 zeroes |
| 250 | # in a row is small therefore having the value below equal \$0x8000000 or |
| 251 | # \$0x8000 Does not affect the performance. We choose 0x8000000 because it |
| 252 | # is the maximal immediate value possible. |
| 253 | cmp \$0x8000000, $shift |
| 254 | jne .Lbeeu_shift_loop_YA |
| 255 | |
| 256 | .Lbeeu_shift_loop_end_YA: |
| 257 | bsf $shift, $shift |
| 258 | test $shift, $shift |
| 259 | jz .Lbeeu_no_shift_YA |
| 260 | |
| 261 | ${\SHIFT256($a_rsp0)} |
| 262 | |
| 263 | .Lbeeu_no_shift_YA: |
| 264 | # T = B-A (A,B < 2^256) |
| 265 | mov $b_rsp0(%rsp), $t0 |
| 266 | mov $b_rsp1(%rsp), $t1 |
| 267 | mov $b_rsp2(%rsp), $t2 |
| 268 | mov $b_rsp3(%rsp), $t3 |
| 269 | sub $a_rsp0(%rsp), $t0 |
| 270 | sbb $a_rsp1(%rsp), $t1 |
| 271 | sbb $a_rsp2(%rsp), $t2 |
| 272 | sbb $a_rsp3(%rsp), $t3 # borrow from shift |
| 273 | jnc .Lbeeu_B_bigger_than_A |
| 274 | |
| 275 | # A = A - B |
| 276 | mov $a_rsp0(%rsp), $t0 |
| 277 | mov $a_rsp1(%rsp), $t1 |
| 278 | mov $a_rsp2(%rsp), $t2 |
| 279 | mov $a_rsp3(%rsp), $t3 |
| 280 | sub $b_rsp0(%rsp), $t0 |
| 281 | sbb $b_rsp1(%rsp), $t1 |
| 282 | sbb $b_rsp2(%rsp), $t2 |
| 283 | sbb $b_rsp3(%rsp), $t3 |
| 284 | mov $t0, $a_rsp0(%rsp) |
| 285 | mov $t1, $a_rsp1(%rsp) |
| 286 | mov $t2, $a_rsp2(%rsp) |
| 287 | mov $t3, $a_rsp3(%rsp) |
| 288 | |
| 289 | # Y = Y + X |
| 290 | add $x0, $y0 |
| 291 | adc $x1, $y1 |
| 292 | adc $x2, $y2 |
| 293 | adc $x3, $y3 |
| 294 | adc $x4, $y4 |
| 295 | jmp .Lbeeu_loop |
| 296 | |
| 297 | .Lbeeu_B_bigger_than_A: |
| 298 | # B = T = B - A |
| 299 | mov $t0, $b_rsp0(%rsp) |
| 300 | mov $t1, $b_rsp1(%rsp) |
| 301 | mov $t2, $b_rsp2(%rsp) |
| 302 | mov $t3, $b_rsp3(%rsp) |
| 303 | |
| 304 | # X = Y + X |
| 305 | add $y0, $x0 |
| 306 | adc $y1, $x1 |
| 307 | adc $y2, $x2 |
| 308 | adc $y3, $x3 |
| 309 | adc $y4, $x4 |
| 310 | |
| 311 | jmp .Lbeeu_loop |
| 312 | |
| 313 | .Lbeeu_loop_end: |
| 314 | # The Euclid's algorithm loop ends when A == beeu(a,n); |
| 315 | # Therefore (-1)*Y*a == A (mod |n|), Y>0 |
| 316 | |
| 317 | # Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) |
| 318 | mov $a_rsp0(%rsp), $t1 |
| 319 | sub \$1, $t1 |
| 320 | or $a_rsp1(%rsp), $t1 |
| 321 | or $a_rsp2(%rsp), $t1 |
| 322 | or $a_rsp3(%rsp), $t1 |
| 323 | # If not, fail. |
| 324 | jnz .Lbeeu_err |
| 325 | |
| 326 | # From this point on, we no longer need X |
| 327 | # Therefore we use it as a temporary storage. |
| 328 | # X = n |
| 329 | movq 0*8($n), $x0 |
| 330 | movq 1*8($n), $x1 |
| 331 | movq 2*8($n), $x2 |
| 332 | movq 3*8($n), $x3 |
| 333 | xorq $x4, $x4 |
| 334 | |
| 335 | .Lbeeu_reduction_loop: |
| 336 | movq $y0, $y_rsp0(%rsp) |
| 337 | movq $y1, $y_rsp1(%rsp) |
| 338 | movq $y2, $y_rsp2(%rsp) |
| 339 | movq $y3, $y_rsp3(%rsp) |
| 340 | movq $y4, $y_rsp4(%rsp) |
| 341 | |
| 342 | # If Y>n ==> Y=Y-n |
| 343 | sub $x0, $y0 |
| 344 | sbb $x1, $y1 |
| 345 | sbb $x2, $y2 |
| 346 | sbb $x3, $y3 |
| 347 | sbb \$0, $y4 |
| 348 | |
| 349 | # Choose old Y or new Y |
| 350 | cmovc $y_rsp0(%rsp), $y0 |
| 351 | cmovc $y_rsp1(%rsp), $y1 |
| 352 | cmovc $y_rsp2(%rsp), $y2 |
| 353 | cmovc $y_rsp3(%rsp), $y3 |
| 354 | jnc .Lbeeu_reduction_loop |
| 355 | |
| 356 | # X = n - Y (n, Y < 2^256), (Cancel the (-1)) |
| 357 | sub $y0, $x0 |
| 358 | sbb $y1, $x1 |
| 359 | sbb $y2, $x2 |
| 360 | sbb $y3, $x3 |
| 361 | |
| 362 | .Lbeeu_save: |
| 363 | # Save the inverse(<2^256) to out. |
| 364 | mov $out_rsp(%rsp), $out |
| 365 | |
| 366 | movq $x0, 0*8($out) |
| 367 | movq $x1, 1*8($out) |
| 368 | movq $x2, 2*8($out) |
| 369 | movq $x3, 3*8($out) |
| 370 | |
| 371 | # Return 1. |
| 372 | movq \$1, %rax |
| 373 | jmp .Lbeeu_finish |
| 374 | |
| 375 | .Lbeeu_err: |
| 376 | # Return 0. |
| 377 | xorq %rax, %rax |
| 378 | |
| 379 | .Lbeeu_finish: |
| 380 | add \$$last_rsp_offset, %rsp |
Robert Sloan | 4c22c5f | 2019-03-01 15:53:37 -0800 | [diff] [blame] | 381 | .cfi_adjust_cfa_offset -$last_rsp_offset |
Robert Sloan | a51059f | 2018-11-12 13:38:50 -0800 | [diff] [blame] | 382 | pop %rsi |
| 383 | .cfi_pop rsi |
| 384 | pop %rbx |
| 385 | .cfi_pop rbx |
| 386 | pop %r15 |
| 387 | .cfi_pop r15 |
| 388 | pop %r14 |
| 389 | .cfi_pop r14 |
| 390 | pop %r13 |
| 391 | .cfi_pop r13 |
| 392 | pop %r12 |
| 393 | .cfi_pop r12 |
| 394 | pop %rbp |
| 395 | .cfi_pop rbp |
Robert Sloan | a51059f | 2018-11-12 13:38:50 -0800 | [diff] [blame] | 396 | ret |
Robert Sloan | 4c22c5f | 2019-03-01 15:53:37 -0800 | [diff] [blame] | 397 | .cfi_endproc |
Robert Sloan | a51059f | 2018-11-12 13:38:50 -0800 | [diff] [blame] | 398 | |
| 399 | .size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime |
| 400 | ___ |
| 401 | |
| 402 | print $code; |
Srinivas Paladugu | dd42a61 | 2019-08-09 19:30:39 +0000 | [diff] [blame] | 403 | close STDOUT; |