external/boringssl: Sync to fa3aadcd40ec4fd27a6e9492ef099b3dcc6eb2af.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/7f7e5e231efec6e86d6c7d3fd1b759be1cece156..fa3aadcd40ec4fd27a6e9492ef099b3dcc6eb2af

Test: BoringSSL CTS Presubmits.
Change-Id: I5381241ee7b94e1076d04090a0bc468b7816a1a1
diff --git a/src/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl b/src/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl
new file mode 100644
index 0000000..12b9f5a
--- /dev/null
+++ b/src/crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl
@@ -0,0 +1,405 @@
+# Copyright (c) 2018, Amazon Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#
+# Written by Nir Drucker, and Shay Gueron
+# AWS Cryptographic Algorithms Group
+# (ndrucker@amazon.com, gueron@amazon.com)
+# based on BN_mod_inverse_odd
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+#############################################################################
+# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+#                                     BN_ULONG a[P256_LIMBS],
+#                                     BN_ULONG n[P256_LIMBS]);
+#
+# (Binary Extended Euclidean Algorithm.
+#  See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
+#
+# Assumption 1: n is odd for the BEEU
+# Assumption 2: 1 < a < n < 2^256
+
+$out = "%rdi";
+$a = "%rsi";
+$n = "%rdx";
+
+# X/Y will hold the inverse parameter
+# Assumption: X,Y<2^(256)
+$x0 = "%r8";
+$x1 = "%r9";
+$x2 = "%r10";
+$x3 = "%r11";
+# borrow from out (out is needed only at the end)
+$x4 = "%rdi";
+$y0 = "%r12";
+$y1 = "%r13";
+$y2 = "%r14";
+$y3 = "%r15";
+$y4 = "%rbp";
+$shift = "%rcx";
+$t0 = "%rax";
+$t1 = "%rbx";
+$t2 = "%rsi";
+# borrow
+$t3 = "%rcx";
+
+$T0 = "%xmm0";
+$T1 = "%xmm1";
+
+# Offsets on the stack
+$out_rsp = 0;
+$shift_rsp = $out_rsp+0x8;
+$a_rsp0 = $shift_rsp+0x8;
+$a_rsp1 = $a_rsp0+0x8;
+$a_rsp2 = $a_rsp1+0x8;
+$a_rsp3 = $a_rsp2+0x8;
+$b_rsp0 = $a_rsp3+0x8;
+$b_rsp1 = $b_rsp0+0x8;
+$b_rsp2 = $b_rsp1+0x8;
+$b_rsp3 = $b_rsp2+0x8;
+
+# Borrow when a_rsp/b_rsp are no longer needed.
+$y_rsp0 = $a_rsp0;
+$y_rsp1 = $y_rsp0+0x8;
+$y_rsp2 = $y_rsp1+0x8;
+$y_rsp3 = $y_rsp2+0x8;
+$y_rsp4 = $y_rsp3+0x8;
+$last_rsp_offset = $b_rsp3+0x8;
+
+sub TEST_B_ZERO {
+  return <<___;
+    xorq $t1, $t1
+    or $b_rsp0(%rsp), $t1
+    or $b_rsp1(%rsp), $t1
+    or $b_rsp2(%rsp), $t1
+    or $b_rsp3(%rsp), $t1
+    jz .Lbeeu_loop_end
+___
+}
+
+$g_next_label = 0;
+
+sub SHIFT1 {
+  my ($var0, $var1, $var2, $var3, $var4) = @_;
+  my $label = ".Lshift1_${g_next_label}";
+  $g_next_label++;
+
+  return <<___;
+    # Ensure X is even and divide by two.
+    movq \$1, $t1
+    andq $var0, $t1
+    jz $label
+    add 0*8($n), $var0
+    adc 1*8($n), $var1
+    adc 2*8($n), $var2
+    adc 3*8($n), $var3
+    adc \$0, $var4
+
+$label:
+    shrdq \$1, $var1, $var0
+    shrdq \$1, $var2, $var1
+    shrdq \$1, $var3, $var2
+    shrdq \$1, $var4, $var3
+    shrq  \$1, $var4
+___
+}
+
+sub SHIFT256 {
+  my ($var) = @_;
+  return <<___;
+    # Copy shifted values.
+    # Remember not to override t3=rcx
+    movq 1*8+$var(%rsp), $t0
+    movq 2*8+$var(%rsp), $t1
+    movq 3*8+$var(%rsp), $t2
+
+    shrdq %cl, $t0, 0*8+$var(%rsp)
+    shrdq %cl, $t1, 1*8+$var(%rsp)
+    shrdq %cl, $t2, 2*8+$var(%rsp)
+
+    shrq  %cl, $t2
+    mov $t2, 3*8+$var(%rsp)
+___
+}
+
+$code.=<<___;
+.text
+
+.type beeu_mod_inverse_vartime,\@function
+.hidden beeu_mod_inverse_vartime
+.globl  beeu_mod_inverse_vartime
+.align 32
+beeu_mod_inverse_vartime:
+.cfi_startproc
+    push %rbp
+.cfi_push rbp
+    movq %rsp, %rbp
+.cfi_def_cfa_register rbp
+
+    push %r12
+.cfi_push r12
+    push %r13
+.cfi_push r13
+    push %r14
+.cfi_push r14
+    push %r15
+.cfi_push r15
+    push %rbx
+.cfi_push rbx
+    push %rsi
+.cfi_push rsi
+
+    sub \$$last_rsp_offset, %rsp
+    movq $out, $out_rsp(%rsp)
+
+    # X=1, Y=0
+    movq \$1, $x0
+    xorq $x1, $x1
+    xorq $x2, $x2
+    xorq $x3, $x3
+    xorq $x4, $x4
+
+    xorq $y0, $y0
+    xorq $y1, $y1
+    xorq $y2, $y2
+    xorq $y3, $y3
+    xorq $y4, $y4
+
+    # Copy a/n into B/A on the stack.
+    vmovdqu 0*8($a), $T0
+    vmovdqu 2*8($a), $T1
+    vmovdqu $T0, $b_rsp0(%rsp)
+    vmovdqu $T1, $b_rsp2(%rsp)
+
+    vmovdqu 0*8($n), $T0
+    vmovdqu 2*8($n), $T1
+    vmovdqu $T0, $a_rsp0(%rsp)
+    vmovdqu $T1, $a_rsp2(%rsp)
+
+.Lbeeu_loop:
+    ${\TEST_B_ZERO}
+
+    # 0 < B < |n|,
+    # 0 < A <= |n|,
+    # (1)      X*a  ==  B   (mod |n|),
+    # (2) (-1)*Y*a  ==  A   (mod |n|)
+
+    # Now divide B by the maximum possible power of two in the
+    # integers, and divide X by the same value mod |n|. When we're
+    # done, (1) still holds.
+    movq \$1, $shift
+
+    # Note that B > 0
+.Lbeeu_shift_loop_XB:
+    movq $shift, $t1
+    andq $b_rsp0(%rsp), $t1
+    jnz .Lbeeu_shift_loop_end_XB
+
+    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
+    shl \$1, $shift
+
+    # Test wraparound of the shift parameter. The probability to have 32 zeroes
+    # in a row is small Therefore having the value below equal \$0x8000000 or
+    # \$0x8000 does not affect the performance. We choose 0x8000000 because it
+    # is the maximal immediate value possible.
+    cmp \$0x8000000, $shift
+    jne .Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+    bsf $shift, $shift
+    test $shift, $shift
+    jz .Lbeeu_no_shift_XB
+
+    ${\SHIFT256($b_rsp0)}
+
+.Lbeeu_no_shift_XB:
+    # Same for A and Y.  Afterwards, (2) still holds.
+    movq \$1, $shift
+
+    # Note that A > 0
+.Lbeeu_shift_loop_YA:
+    movq $shift, $t1
+    andq $a_rsp0(%rsp), $t1
+    jnz .Lbeeu_shift_loop_end_YA
+
+    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
+    shl \$1, $shift
+
+    # Test wraparound of the shift parameter. The probability to have 32 zeroes
+    # in a row is small therefore having the value below equal \$0x8000000 or
+    # \$0x8000 Does not affect the performance. We choose 0x8000000 because it
+    # is the maximal immediate value possible.
+    cmp \$0x8000000, $shift
+    jne .Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+    bsf $shift, $shift
+    test $shift, $shift
+    jz .Lbeeu_no_shift_YA
+
+    ${\SHIFT256($a_rsp0)}
+
+.Lbeeu_no_shift_YA:
+    # T = B-A (A,B < 2^256)
+    mov $b_rsp0(%rsp), $t0
+    mov $b_rsp1(%rsp), $t1
+    mov $b_rsp2(%rsp), $t2
+    mov $b_rsp3(%rsp), $t3
+    sub $a_rsp0(%rsp), $t0
+    sbb $a_rsp1(%rsp), $t1
+    sbb $a_rsp2(%rsp), $t2
+    sbb $a_rsp3(%rsp), $t3  # borrow from shift
+    jnc .Lbeeu_B_bigger_than_A
+
+    # A = A - B
+    mov $a_rsp0(%rsp), $t0
+    mov $a_rsp1(%rsp), $t1
+    mov $a_rsp2(%rsp), $t2
+    mov $a_rsp3(%rsp), $t3
+    sub $b_rsp0(%rsp), $t0
+    sbb $b_rsp1(%rsp), $t1
+    sbb $b_rsp2(%rsp), $t2
+    sbb $b_rsp3(%rsp), $t3
+    mov $t0, $a_rsp0(%rsp)
+    mov $t1, $a_rsp1(%rsp)
+    mov $t2, $a_rsp2(%rsp)
+    mov $t3, $a_rsp3(%rsp)
+
+    # Y = Y + X
+    add $x0, $y0
+    adc $x1, $y1
+    adc $x2, $y2
+    adc $x3, $y3
+    adc $x4, $y4
+    jmp .Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+    # B = T = B - A
+    mov $t0, $b_rsp0(%rsp)
+    mov $t1, $b_rsp1(%rsp)
+    mov $t2, $b_rsp2(%rsp)
+    mov $t3, $b_rsp3(%rsp)
+
+    # X = Y + X
+    add $y0, $x0
+    adc $y1, $x1
+    adc $y2, $x2
+    adc $y3, $x3
+    adc $y4, $x4
+
+    jmp .Lbeeu_loop
+
+.Lbeeu_loop_end:
+    # The Euclid's algorithm loop ends when A == beeu(a,n);
+    # Therefore (-1)*Y*a == A (mod |n|), Y>0
+
+    # Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
+    mov $a_rsp0(%rsp), $t1
+    sub \$1, $t1
+    or $a_rsp1(%rsp), $t1
+    or $a_rsp2(%rsp), $t1
+    or $a_rsp3(%rsp), $t1
+    # If not, fail.
+    jnz .Lbeeu_err
+
+    # From this point on, we no longer need X
+    # Therefore we use it as a temporary storage.
+    # X = n
+    movq 0*8($n), $x0
+    movq 1*8($n), $x1
+    movq 2*8($n), $x2
+    movq 3*8($n), $x3
+    xorq $x4, $x4
+
+.Lbeeu_reduction_loop:
+    movq $y0, $y_rsp0(%rsp)
+    movq $y1, $y_rsp1(%rsp)
+    movq $y2, $y_rsp2(%rsp)
+    movq $y3, $y_rsp3(%rsp)
+    movq $y4, $y_rsp4(%rsp)
+
+    # If Y>n ==> Y=Y-n
+    sub $x0, $y0
+    sbb $x1, $y1
+    sbb $x2, $y2
+    sbb $x3, $y3
+    sbb \$0, $y4
+
+    # Choose old Y or new Y
+    cmovc $y_rsp0(%rsp), $y0
+    cmovc $y_rsp1(%rsp), $y1
+    cmovc $y_rsp2(%rsp), $y2
+    cmovc $y_rsp3(%rsp), $y3
+    jnc .Lbeeu_reduction_loop
+
+    # X = n - Y (n, Y < 2^256), (Cancel the (-1))
+    sub $y0, $x0
+    sbb $y1, $x1
+    sbb $y2, $x2
+    sbb $y3, $x3
+
+.Lbeeu_save:
+    # Save the inverse(<2^256) to out.
+    mov $out_rsp(%rsp), $out
+
+    movq $x0, 0*8($out)
+    movq $x1, 1*8($out)
+    movq $x2, 2*8($out)
+    movq $x3, 3*8($out)
+
+    # Return 1.
+    movq \$1, %rax
+    jmp .Lbeeu_finish
+
+.Lbeeu_err:
+    # Return 0.
+    xorq %rax, %rax
+
+.Lbeeu_finish:
+    add \$$last_rsp_offset, %rsp
+    pop %rsi
+.cfi_pop rsi
+    pop %rbx
+.cfi_pop rbx
+    pop %r15
+.cfi_pop r15
+    pop %r14
+.cfi_pop r14
+    pop %r13
+.cfi_pop r13
+    pop %r12
+.cfi_pop r12
+    pop %rbp
+.cfi_pop rbp
+.cfi_def_cfa rsp, 8
+.cfi_endproc
+    ret
+
+.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+___
+
+print $code;
+close STDOUT;
diff --git a/src/crypto/fipsmodule/ec/ec.c b/src/crypto/fipsmodule/ec/ec.c
index 908e35e..ba101fe 100644
--- a/src/crypto/fipsmodule/ec/ec.c
+++ b/src/crypto/fipsmodule/ec/ec.c
@@ -619,7 +619,7 @@
 int EC_GROUP_get_curve_name(const EC_GROUP *group) { return group->curve_name; }
 
 unsigned EC_GROUP_get_degree(const EC_GROUP *group) {
-  return ec_GFp_simple_group_get_degree(group);
+  return BN_num_bits(&group->field);
 }
 
 const char *EC_curve_nid2nist(int nid) {
@@ -743,7 +743,15 @@
     OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
     return 0;
   }
-  return group->meth->point_get_affine_coordinates(group, &point->raw, x, y);
+  EC_FELEM x_felem, y_felem;
+  if (!group->meth->point_get_affine_coordinates(group, &point->raw,
+                                                 x == NULL ? NULL : &x_felem,
+                                                 y == NULL ? NULL : &y_felem) ||
+      (x != NULL && !bn_set_words(x, x_felem.words, group->field.width)) ||
+      (y != NULL && !bn_set_words(y, y_felem.words, group->field.width))) {
+    return 0;
+  }
+  return 1;
 }
 
 int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
@@ -782,7 +790,7 @@
     OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
     return 0;
   }
-  ec_GFp_simple_add(group, &r->raw, &a->raw, &b->raw);
+  group->meth->add(group, &r->raw, &a->raw, &b->raw);
   return 1;
 }
 
@@ -793,7 +801,7 @@
     OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS);
     return 0;
   }
-  ec_GFp_simple_dbl(group, &r->raw, &a->raw);
+  group->meth->dbl(group, &r->raw, &a->raw);
   return 1;
 }
 
@@ -911,6 +919,43 @@
   return 1;
 }
 
+int ec_cmp_x_coordinate(int *out_result, const EC_GROUP *group,
+                        const EC_POINT *p, const BIGNUM *r, BN_CTX *ctx) {
+  return group->meth->cmp_x_coordinate(out_result, group, p, r, ctx);
+}
+
+int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r) {
+  // We must have p < 2×order, assuming p is not tiny (p >= 17). Thus rather we
+  // can reduce by performing at most one subtraction.
+  //
+  // Proof: We only work with prime order curves, so the number of points on
+  // the curve is the order. Thus Hasse's theorem gives:
+  //
+  //     |order - (p + 1)| <= 2×sqrt(p)
+  //         p + 1 - order <= 2×sqrt(p)
+  //     p + 1 - 2×sqrt(p) <= order
+  //       p + 1 - 2×(p/4)  < order       (p/4 > sqrt(p) for p >= 17)
+  //         p/2 < p/2 + 1  < order
+  //                     p  < 2×order
+  //
+  // Additionally, one can manually check this property for built-in curves. It
+  // is enforced for legacy custom curves in |EC_GROUP_set_generator|.
+  //
+  // TODO(davidben): Introduce |EC_FIELD_ELEMENT|, make this a function from
+  // |EC_FIELD_ELEMENT| to |EC_SCALAR|, and cut out the |BIGNUM|. Does this need
+  // to be constant-time for signing? |r| is the x-coordinate for kG, which is
+  // public unless k was rerolled because |s| was zero.
+  assert(!BN_is_negative(r));
+  assert(BN_cmp(r, &group->field) < 0);
+  if (BN_cmp(r, &group->order) >= 0 &&
+      !BN_sub(r, r, &group->order)) {
+    return 0;
+  }
+  assert(!BN_is_negative(r));
+  assert(BN_cmp(r, &group->order) < 0);
+  return 1;
+}
+
 void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {}
 
 const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) {
diff --git a/src/crypto/fipsmodule/ec/ec_montgomery.c b/src/crypto/fipsmodule/ec/ec_montgomery.c
index 9eace95..4961a7c 100644
--- a/src/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/src/crypto/fipsmodule/ec/ec_montgomery.c
@@ -182,7 +182,7 @@
 
 static int ec_GFp_mont_point_get_affine_coordinates(const EC_GROUP *group,
                                                     const EC_RAW_POINT *point,
-                                                    BIGNUM *x, BIGNUM *y) {
+                                                    EC_FELEM *x, EC_FELEM *y) {
   if (ec_GFp_simple_is_at_infinity(group, point)) {
     OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
     return 0;
@@ -201,35 +201,236 @@
   ec_GFp_mont_felem_from_montgomery(group, &z1, &z1);
 
   if (x != NULL) {
-    EC_FELEM tmp;
-    ec_GFp_mont_felem_mul(group, &tmp, &point->X, &z1);
-    if (!bn_set_words(x, tmp.words, group->field.width)) {
-      return 0;
-    }
+    ec_GFp_mont_felem_mul(group, x, &point->X, &z1);
   }
 
   if (y != NULL) {
-    EC_FELEM tmp;
     ec_GFp_mont_felem_mul(group, &z1, &z1, &z2);
-    ec_GFp_mont_felem_mul(group, &tmp, &point->Y, &z1);
-    if (!bn_set_words(y, tmp.words, group->field.width)) {
-      return 0;
-    }
+    ec_GFp_mont_felem_mul(group, y, &point->Y, &z1);
   }
 
   return 1;
 }
 
+void ec_GFp_mont_add(const EC_GROUP *group, EC_RAW_POINT *out,
+                     const EC_RAW_POINT *a, const EC_RAW_POINT *b) {
+  if (a == b) {
+    ec_GFp_mont_dbl(group, out, a);
+    return;
+  }
+
+  // The method is taken from:
+  //   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl
+  //
+  // Coq transcription and correctness proof:
+  // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L467>
+  // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L544>
+  EC_FELEM x_out, y_out, z_out;
+  BN_ULONG z1nz = ec_felem_non_zero_mask(group, &a->Z);
+  BN_ULONG z2nz = ec_felem_non_zero_mask(group, &b->Z);
+
+  // z1z1 = z1z1 = z1**2
+  EC_FELEM z1z1;
+  ec_GFp_mont_felem_sqr(group, &z1z1, &a->Z);
+
+  // z2z2 = z2**2
+  EC_FELEM z2z2;
+  ec_GFp_mont_felem_sqr(group, &z2z2, &b->Z);
+
+  // u1 = x1*z2z2
+  EC_FELEM u1;
+  ec_GFp_mont_felem_mul(group, &u1, &a->X, &z2z2);
+
+  // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
+  EC_FELEM two_z1z2;
+  ec_felem_add(group, &two_z1z2, &a->Z, &b->Z);
+  ec_GFp_mont_felem_sqr(group, &two_z1z2, &two_z1z2);
+  ec_felem_sub(group, &two_z1z2, &two_z1z2, &z1z1);
+  ec_felem_sub(group, &two_z1z2, &two_z1z2, &z2z2);
+
+  // s1 = y1 * z2**3
+  EC_FELEM s1;
+  ec_GFp_mont_felem_mul(group, &s1, &b->Z, &z2z2);
+  ec_GFp_mont_felem_mul(group, &s1, &s1, &a->Y);
+
+  // u2 = x2*z1z1
+  EC_FELEM u2;
+  ec_GFp_mont_felem_mul(group, &u2, &b->X, &z1z1);
+
+  // h = u2 - u1
+  EC_FELEM h;
+  ec_felem_sub(group, &h, &u2, &u1);
+
+  BN_ULONG xneq = ec_felem_non_zero_mask(group, &h);
+
+  // z_out = two_z1z2 * h
+  ec_GFp_mont_felem_mul(group, &z_out, &h, &two_z1z2);
+
+  // z1z1z1 = z1 * z1z1
+  EC_FELEM z1z1z1;
+  ec_GFp_mont_felem_mul(group, &z1z1z1, &a->Z, &z1z1);
+
+  // s2 = y2 * z1**3
+  EC_FELEM s2;
+  ec_GFp_mont_felem_mul(group, &s2, &b->Y, &z1z1z1);
+
+  // r = (s2 - s1)*2
+  EC_FELEM r;
+  ec_felem_sub(group, &r, &s2, &s1);
+  ec_felem_add(group, &r, &r, &r);
+
+  BN_ULONG yneq = ec_felem_non_zero_mask(group, &r);
+
+  // This case will never occur in the constant-time |ec_GFp_mont_mul|.
+  if (!xneq && !yneq && z1nz && z2nz) {
+    ec_GFp_mont_dbl(group, out, a);
+    return;
+  }
+
+  // I = (2h)**2
+  EC_FELEM i;
+  ec_felem_add(group, &i, &h, &h);
+  ec_GFp_mont_felem_sqr(group, &i, &i);
+
+  // J = h * I
+  EC_FELEM j;
+  ec_GFp_mont_felem_mul(group, &j, &h, &i);
+
+  // V = U1 * I
+  EC_FELEM v;
+  ec_GFp_mont_felem_mul(group, &v, &u1, &i);
+
+  // x_out = r**2 - J - 2V
+  ec_GFp_mont_felem_sqr(group, &x_out, &r);
+  ec_felem_sub(group, &x_out, &x_out, &j);
+  ec_felem_sub(group, &x_out, &x_out, &v);
+  ec_felem_sub(group, &x_out, &x_out, &v);
+
+  // y_out = r(V-x_out) - 2 * s1 * J
+  ec_felem_sub(group, &y_out, &v, &x_out);
+  ec_GFp_mont_felem_mul(group, &y_out, &y_out, &r);
+  EC_FELEM s1j;
+  ec_GFp_mont_felem_mul(group, &s1j, &s1, &j);
+  ec_felem_sub(group, &y_out, &y_out, &s1j);
+  ec_felem_sub(group, &y_out, &y_out, &s1j);
+
+  ec_felem_select(group, &x_out, z1nz, &x_out, &b->X);
+  ec_felem_select(group, &out->X, z2nz, &x_out, &a->X);
+  ec_felem_select(group, &y_out, z1nz, &y_out, &b->Y);
+  ec_felem_select(group, &out->Y, z2nz, &y_out, &a->Y);
+  ec_felem_select(group, &z_out, z1nz, &z_out, &b->Z);
+  ec_felem_select(group, &out->Z, z2nz, &z_out, &a->Z);
+}
+
+void ec_GFp_mont_dbl(const EC_GROUP *group, EC_RAW_POINT *r,
+                     const EC_RAW_POINT *a) {
+  if (group->a_is_minus3) {
+    // The method is taken from:
+    //   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+    //
+    // Coq transcription and correctness proof:
+    // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
+    // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
+    EC_FELEM delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
+    // delta = z^2
+    ec_GFp_mont_felem_sqr(group, &delta, &a->Z);
+    // gamma = y^2
+    ec_GFp_mont_felem_sqr(group, &gamma, &a->Y);
+    // beta = x*gamma
+    ec_GFp_mont_felem_mul(group, &beta, &a->X, &gamma);
+
+    // alpha = 3*(x-delta)*(x+delta)
+    ec_felem_sub(group, &ftmp, &a->X, &delta);
+    ec_felem_add(group, &ftmp2, &a->X, &delta);
+
+    ec_felem_add(group, &tmptmp, &ftmp2, &ftmp2);
+    ec_felem_add(group, &ftmp2, &ftmp2, &tmptmp);
+    ec_GFp_mont_felem_mul(group, &alpha, &ftmp, &ftmp2);
+
+    // x' = alpha^2 - 8*beta
+    ec_GFp_mont_felem_sqr(group, &r->X, &alpha);
+    ec_felem_add(group, &fourbeta, &beta, &beta);
+    ec_felem_add(group, &fourbeta, &fourbeta, &fourbeta);
+    ec_felem_add(group, &tmptmp, &fourbeta, &fourbeta);
+    ec_felem_sub(group, &r->X, &r->X, &tmptmp);
+
+    // z' = (y + z)^2 - gamma - delta
+    ec_felem_add(group, &delta, &gamma, &delta);
+    ec_felem_add(group, &ftmp, &a->Y, &a->Z);
+    ec_GFp_mont_felem_sqr(group, &r->Z, &ftmp);
+    ec_felem_sub(group, &r->Z, &r->Z, &delta);
+
+    // y' = alpha*(4*beta - x') - 8*gamma^2
+    ec_felem_sub(group, &r->Y, &fourbeta, &r->X);
+    ec_felem_add(group, &gamma, &gamma, &gamma);
+    ec_GFp_mont_felem_sqr(group, &gamma, &gamma);
+    ec_GFp_mont_felem_mul(group, &r->Y, &alpha, &r->Y);
+    ec_felem_add(group, &gamma, &gamma, &gamma);
+    ec_felem_sub(group, &r->Y, &r->Y, &gamma);
+  } else {
+    // The method is taken from:
+    //   http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
+    //
+    // Coq transcription and correctness proof:
+    // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L102>
+    // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L534>
+    EC_FELEM xx, yy, yyyy, zz;
+    ec_GFp_mont_felem_sqr(group, &xx, &a->X);
+    ec_GFp_mont_felem_sqr(group, &yy, &a->Y);
+    ec_GFp_mont_felem_sqr(group, &yyyy, &yy);
+    ec_GFp_mont_felem_sqr(group, &zz, &a->Z);
+
+    // s = 2*((x_in + yy)^2 - xx - yyyy)
+    EC_FELEM s;
+    ec_felem_add(group, &s, &a->X, &yy);
+    ec_GFp_mont_felem_sqr(group, &s, &s);
+    ec_felem_sub(group, &s, &s, &xx);
+    ec_felem_sub(group, &s, &s, &yyyy);
+    ec_felem_add(group, &s, &s, &s);
+
+    // m = 3*xx + a*zz^2
+    EC_FELEM m;
+    ec_GFp_mont_felem_sqr(group, &m, &zz);
+    ec_GFp_mont_felem_mul(group, &m, &group->a, &m);
+    ec_felem_add(group, &m, &m, &xx);
+    ec_felem_add(group, &m, &m, &xx);
+    ec_felem_add(group, &m, &m, &xx);
+
+    // x_out = m^2 - 2*s
+    ec_GFp_mont_felem_sqr(group, &r->X, &m);
+    ec_felem_sub(group, &r->X, &r->X, &s);
+    ec_felem_sub(group, &r->X, &r->X, &s);
+
+    // z_out = (y_in + z_in)^2 - yy - zz
+    ec_felem_add(group, &r->Z, &a->Y, &a->Z);
+    ec_GFp_mont_felem_sqr(group, &r->Z, &r->Z);
+    ec_felem_sub(group, &r->Z, &r->Z, &yy);
+    ec_felem_sub(group, &r->Z, &r->Z, &zz);
+
+    // y_out = m*(s-x_out) - 8*yyyy
+    ec_felem_add(group, &yyyy, &yyyy, &yyyy);
+    ec_felem_add(group, &yyyy, &yyyy, &yyyy);
+    ec_felem_add(group, &yyyy, &yyyy, &yyyy);
+    ec_felem_sub(group, &r->Y, &s, &r->X);
+    ec_GFp_mont_felem_mul(group, &r->Y, &r->Y, &m);
+    ec_felem_sub(group, &r->Y, &r->Y, &yyyy);
+  }
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_mont_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
   out->group_set_curve = ec_GFp_mont_group_set_curve;
   out->point_get_affine_coordinates = ec_GFp_mont_point_get_affine_coordinates;
-  out->mul = ec_GFp_simple_mul;
-  out->mul_public = ec_GFp_simple_mul_public;
+  out->add = ec_GFp_mont_add;
+  out->dbl = ec_GFp_mont_dbl;
+  out->mul = ec_GFp_mont_mul;
+  out->mul_public = ec_GFp_mont_mul_public;
   out->felem_mul = ec_GFp_mont_felem_mul;
   out->felem_sqr = ec_GFp_mont_felem_sqr;
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 }
diff --git a/src/crypto/fipsmodule/ec/internal.h b/src/crypto/fipsmodule/ec/internal.h
index bb172b2..4afaef9 100644
--- a/src/crypto/fipsmodule/ec/internal.h
+++ b/src/crypto/fipsmodule/ec/internal.h
@@ -124,8 +124,21 @@
   void (*group_finish)(EC_GROUP *);
   int (*group_set_curve)(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
                          const BIGNUM *b, BN_CTX *);
-  int (*point_get_affine_coordinates)(const EC_GROUP *, const EC_RAW_POINT *,
-                                      BIGNUM *x, BIGNUM *y);
+
+  // point_get_affine_coordinates sets |*x| and |*y| to the affine coordinates
+  // of |p|. Either |x| or |y| may be NULL to omit it. It returns one on success
+  // and zero if |p| is the point at infinity.
+  //
+  // Note: unlike |EC_FELEM|s used as intermediate values internal to the
+  // |EC_METHOD|, |*x| and |*y| are not encoded in Montgomery form.
+  int (*point_get_affine_coordinates)(const EC_GROUP *, const EC_RAW_POINT *p,
+                                      EC_FELEM *x, EC_FELEM *y);
+
+  // add sets |r| to |a| + |b|.
+  void (*add)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *a,
+              const EC_RAW_POINT *b);
+  // dbl sets |r| to |a| + |a|.
+  void (*dbl)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *a);
 
   // Computes |r = g_scalar*generator + p_scalar*p| if |g_scalar| and |p_scalar|
   // are both non-null. Computes |r = g_scalar*generator| if |p_scalar| is null.
@@ -149,10 +162,9 @@
   // TODO(davidben): This constrains |EC_FELEM|'s internal representation, adds
   // many indirect calls in the middle of the generic code, and a bunch of
   // conversions. If p224-64.c were easily convertable to Montgomery form, we
-  // could say |EC_FELEM| is always in Montgomery form. If we exposed the
-  // internal add and double implementations in each of the curves, we could
-  // give |EC_POINT| an |EC_METHOD|-specific representation and |EC_FELEM| is
-  // purely a |EC_GFp_mont_method| type.
+  // could say |EC_FELEM| is always in Montgomery form. If we routed the rest of
+  // simple.c to |EC_METHOD|, we could give |EC_POINT| an |EC_METHOD|-specific
+  // representation and say |EC_FELEM| is purely a |EC_GFp_mont_method| type.
   void (*felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
                     const EC_FELEM *b);
   void (*felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a);
@@ -162,11 +174,22 @@
   int (*felem_to_bignum)(const EC_GROUP *group, BIGNUM *out,
                          const EC_FELEM *in);
 
-  // scalar_inv_mont sets |out| to |in|^-1, where both input and output are in
-  // Montgomery form.
+  // scalar_inv_montgomery sets |out| to |in|^-1, where both input and output
+  // are in Montgomery form.
   void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
                                 const EC_SCALAR *in);
 
+  // scalar_inv_montgomery_vartime performs the same computation as
+  // |scalar_inv_montgomery|. It further assumes that the inputs are public so
+  // there is no concern about leaking their values through timing.
+  int (*scalar_inv_montgomery_vartime)(const EC_GROUP *group, EC_SCALAR *out,
+                                       const EC_SCALAR *in);
+
+  // cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group
+  // order, with |r|. On error it returns zero. Otherwise it sets |*out_result|
+  // to one iff the values match.
+  int (*cmp_x_coordinate)(int *out_result, const EC_GROUP *group,
+                          const EC_POINT *p, const BIGNUM *r, BN_CTX *ctx);
 } /* EC_METHOD */;
 
 const EC_METHOD *EC_GFp_mont_method(void);
@@ -272,6 +295,11 @@
 void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
                               const EC_SCALAR *a);
 
+// ec_scalar_inv_montgomery_vartime performs the same actions as
+// |ec_scalar_inv_montgomery|, but in variable time.
+int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a);
+
 // ec_point_mul_scalar sets |r| to generator * |g_scalar| + |p| *
 // |p_scalar|. Unlike other functions which take |EC_SCALAR|, |g_scalar| and
 // |p_scalar| need not be fully reduced. They need only contain as many bits as
@@ -287,9 +315,19 @@
     const EC_GROUP *group, EC_POINT *r, const EC_SCALAR *g_scalar,
     const EC_POINT *p, const EC_SCALAR *p_scalar, BN_CTX *ctx);
 
-void ec_GFp_simple_mul(const EC_GROUP *group, EC_RAW_POINT *r,
-                       const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
-                       const EC_SCALAR *p_scalar);
+// ec_cmp_x_coordinate compares the x (affine) coordinate of |p| with |r|. It
+// returns zero on error. Otherwise it sets |*out_result| to one iff the values
+// match.
+int ec_cmp_x_coordinate(int *out_result, const EC_GROUP *group,
+                        const EC_POINT *p, const BIGNUM *r, BN_CTX *ctx);
+
+// ec_field_element_to_scalar reduces |r| modulo |group->order|. |r| must
+// previously have been reduced modulo |group->field|.
+int ec_field_element_to_scalar(const EC_GROUP *group, BIGNUM *r);
+
+void ec_GFp_mont_mul(const EC_GROUP *group, EC_RAW_POINT *r,
+                     const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                     const EC_SCALAR *p_scalar);
 
 // ec_compute_wNAF writes the modified width-(w+1) Non-Adjacent Form (wNAF) of
 // |scalar| to |out|. |out| must have room for |bits| + 1 elements, each of
@@ -302,9 +340,9 @@
 void ec_compute_wNAF(const EC_GROUP *group, int8_t *out,
                      const EC_SCALAR *scalar, size_t bits, int w);
 
-void ec_GFp_simple_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
-                              const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
-                              const EC_SCALAR *p_scalar);
+void ec_GFp_mont_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
+                            const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                            const EC_SCALAR *p_scalar);
 
 // method functions in simple.c
 int ec_GFp_simple_group_init(EC_GROUP *);
@@ -313,17 +351,15 @@
                                   const BIGNUM *b, BN_CTX *);
 int ec_GFp_simple_group_get_curve(const EC_GROUP *, BIGNUM *p, BIGNUM *a,
                                   BIGNUM *b);
-unsigned ec_GFp_simple_group_get_degree(const EC_GROUP *);
 void ec_GFp_simple_point_init(EC_RAW_POINT *);
 void ec_GFp_simple_point_copy(EC_RAW_POINT *, const EC_RAW_POINT *);
 void ec_GFp_simple_point_set_to_infinity(const EC_GROUP *, EC_RAW_POINT *);
 int ec_GFp_simple_point_set_affine_coordinates(const EC_GROUP *, EC_RAW_POINT *,
                                                const BIGNUM *x,
                                                const BIGNUM *y);
-void ec_GFp_simple_add(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a,
-                       const EC_RAW_POINT *b);
-void ec_GFp_simple_dbl(const EC_GROUP *, EC_RAW_POINT *r,
-                       const EC_RAW_POINT *a);
+void ec_GFp_mont_add(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a,
+                     const EC_RAW_POINT *b);
+void ec_GFp_mont_dbl(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a);
 void ec_GFp_simple_invert(const EC_GROUP *, EC_RAW_POINT *);
 int ec_GFp_simple_is_at_infinity(const EC_GROUP *, const EC_RAW_POINT *);
 int ec_GFp_simple_is_on_curve(const EC_GROUP *, const EC_RAW_POINT *);
@@ -332,6 +368,16 @@
 void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
                                      const EC_SCALAR *a);
 
+int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                           const EC_SCALAR *a);
+
+// ec_GFp_simple_cmp_x_coordinate compares the x (affine) coordinate of |p|, mod
+// the group order, with |r|. It returns zero on error. Otherwise it sets
+// |*out_result| to one iff the values match.
+int ec_GFp_simple_cmp_x_coordinate(int *out_result, const EC_GROUP *group,
+                                   const EC_POINT *p, const BIGNUM *r,
+                                   BN_CTX *ctx);
+
 // method functions in montgomery.c
 int ec_GFp_mont_group_init(EC_GROUP *);
 int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a,
diff --git a/src/crypto/fipsmodule/ec/p224-64.c b/src/crypto/fipsmodule/ec/p224-64.c
index 606108f..49d5328 100644
--- a/src/crypto/fipsmodule/ec/p224-64.c
+++ b/src/crypto/fipsmodule/ec/p224-64.c
@@ -203,13 +203,6 @@
   }
 }
 
-// From internal representation to OpenSSL BIGNUM
-static BIGNUM *p224_felem_to_BN(BIGNUM *out, const p224_felem in) {
-  p224_felem_bytearray b_out;
-  p224_felem_to_bin28(b_out, in);
-  return BN_le2bn(b_out, sizeof(b_out), out);
-}
-
 static void p224_generic_to_felem(p224_felem out, const EC_FELEM *in) {
   p224_bin28_to_felem(out, in->bytes);
 }
@@ -917,7 +910,7 @@
 
       if (!skip) {
         p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
-                  tmp[0], tmp[1], tmp[2]);
+                       tmp[0], tmp[1], tmp[2]);
       } else {
         OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
         skip = 0;
@@ -951,7 +944,7 @@
 
       if (!skip) {
         p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */,
-                  tmp[0], tmp[1], tmp[2]);
+                       tmp[0], tmp[1], tmp[2]);
       } else {
         OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem));
         skip = 0;
@@ -971,7 +964,8 @@
 // Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 // (X', Y') = (X/Z^2, Y/Z^3)
 static int ec_GFp_nistp224_point_get_affine_coordinates(
-    const EC_GROUP *group, const EC_RAW_POINT *point, BIGNUM *x, BIGNUM *y) {
+    const EC_GROUP *group, const EC_RAW_POINT *point, EC_FELEM *x,
+    EC_FELEM *y) {
   if (ec_GFp_simple_is_at_infinity(group, point)) {
     OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
     return 0;
@@ -990,10 +984,7 @@
     p224_felem_mul(tmp, x_in, z1);
     p224_felem_reduce(x_in, tmp);
     p224_felem_contract(x_out, x_in);
-    if (!p224_felem_to_BN(x, x_out)) {
-      OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
-      return 0;
-    }
+    p224_felem_to_generic(x, x_out);
   }
 
   if (y != NULL) {
@@ -1004,15 +995,39 @@
     p224_felem_mul(tmp, y_in, z1);
     p224_felem_reduce(y_in, tmp);
     p224_felem_contract(y_out, y_in);
-    if (!p224_felem_to_BN(y, y_out)) {
-      OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
-      return 0;
-    }
+    p224_felem_to_generic(y, y_out);
   }
 
   return 1;
 }
 
+static void ec_GFp_nistp224_add(const EC_GROUP *group, EC_RAW_POINT *r,
+                                const EC_RAW_POINT *a, const EC_RAW_POINT *b) {
+  p224_felem x1, y1, z1, x2, y2, z2;
+  p224_generic_to_felem(x1, &a->X);
+  p224_generic_to_felem(y1, &a->Y);
+  p224_generic_to_felem(z1, &a->Z);
+  p224_generic_to_felem(x2, &b->X);
+  p224_generic_to_felem(y2, &b->Y);
+  p224_generic_to_felem(z2, &b->Z);
+  p224_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, z2);
+  p224_felem_to_generic(&r->X, x1);
+  p224_felem_to_generic(&r->Y, y1);
+  p224_felem_to_generic(&r->Z, z1);
+}
+
+static void ec_GFp_nistp224_dbl(const EC_GROUP *group, EC_RAW_POINT *r,
+                                const EC_RAW_POINT *a) {
+  p224_felem x, y, z;
+  p224_generic_to_felem(x, &a->X);
+  p224_generic_to_felem(y, &a->Y);
+  p224_generic_to_felem(z, &a->Z);
+  p224_point_double(x, y, z, x, y, z);
+  p224_felem_to_generic(&r->X, x);
+  p224_felem_to_generic(&r->Y, y);
+  p224_felem_to_generic(&r->Z, z);
+}
+
 static void ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                                        const EC_SCALAR *g_scalar,
                                        const EC_RAW_POINT *p,
@@ -1036,13 +1051,13 @@
     for (size_t j = 2; j <= 16; ++j) {
       if (j & 1) {
         p224_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
-                  p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2],
-                  0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
-                  p_pre_comp[j - 1][2]);
+                       p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], 0,
+                       p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
+                       p_pre_comp[j - 1][2]);
       } else {
-        p224_point_double(p_pre_comp[j][0], p_pre_comp[j][1],
-                     p_pre_comp[j][2], p_pre_comp[j / 2][0],
-                     p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]);
+        p224_point_double(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
+                          p_pre_comp[j / 2][0], p_pre_comp[j / 2][1],
+                          p_pre_comp[j / 2][2]);
       }
     }
   }
@@ -1100,6 +1115,8 @@
   out->group_set_curve = ec_GFp_simple_group_set_curve;
   out->point_get_affine_coordinates =
       ec_GFp_nistp224_point_get_affine_coordinates;
+  out->add = ec_GFp_nistp224_add;
+  out->dbl = ec_GFp_nistp224_dbl;
   out->mul = ec_GFp_nistp224_points_mul;
   out->mul_public = ec_GFp_nistp224_points_mul;
   out->felem_mul = ec_GFp_nistp224_felem_mul;
@@ -1107,6 +1124,8 @@
   out->bignum_to_felem = ec_GFp_nistp224_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_nistp224_felem_to_bignum;
   out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
+  out->scalar_inv_montgomery_vartime = ec_GFp_simple_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate;
 };
 
 #endif  // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index a4f6515..e7f4909 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -44,6 +44,12 @@
     TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe),
 };
 
+// P256_ORDER is the order of the P-256 group, not in Montgomery form.
+static const BN_ULONG P256_ORDER[P256_LIMBS] = {
+    TOBN(0xf3b9cac2, 0xfc632551), TOBN(0xbce6faad, 0xa7179e84),
+    TOBN(0xffffffff, 0xffffffff), TOBN(0xffffffff, 0x00000000),
+};
+
 // Precomputed tables for the default generator
 #include "p256-x86_64-table.h"
 
@@ -289,19 +295,63 @@
   ecp_nistz256_point_add(r, r, &h);
 }
 
+typedef union {
+  P256_POINT p;
+  P256_POINT_AFFINE a;
+} p256_point_union_t;
+
+static unsigned calc_first_wvalue(unsigned *index, const uint8_t p_str[33]) {
+  static const unsigned kWindowSize = 7;
+  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
+  *index = kWindowSize;
+
+  unsigned wvalue = (p_str[0] << 1) & kMask;
+  return booth_recode_w7(wvalue);
+}
+
+static unsigned calc_wvalue(unsigned *index, const uint8_t p_str[33]) {
+  static const unsigned kWindowSize = 7;
+  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
+
+  const unsigned off = (*index - 1) / 8;
+  unsigned wvalue = p_str[off] | p_str[off + 1] << 8;
+  wvalue = (wvalue >> ((*index - 1) % 8)) & kMask;
+  *index += kWindowSize;
+
+  return booth_recode_w7(wvalue);
+}
+
+static void mul_p_add_and_store(const EC_GROUP *group, EC_RAW_POINT *r,
+                                const EC_SCALAR *g_scalar,
+                                const EC_RAW_POINT *p_,
+                                const EC_SCALAR *p_scalar,
+                                p256_point_union_t *t, p256_point_union_t *p) {
+  const int p_is_infinity = g_scalar == NULL;
+  if (p_scalar != NULL) {
+    P256_POINT *out = &t->p;
+    if (p_is_infinity) {
+      out = &p->p;
+    }
+
+    ecp_nistz256_windowed_mul(group, out, p_, p_scalar);
+    if (!p_is_infinity) {
+      ecp_nistz256_point_add(&p->p, &p->p, out);
+    }
+  }
+
+  assert(group->field.width == P256_LIMBS);
+  OPENSSL_memcpy(r->X.words, p->p.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y.words, p->p.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z.words, p->p.Z, P256_LIMBS * sizeof(BN_ULONG));
+}
+
 static void ecp_nistz256_points_mul(const EC_GROUP *group, EC_RAW_POINT *r,
                                     const EC_SCALAR *g_scalar,
                                     const EC_RAW_POINT *p_,
                                     const EC_SCALAR *p_scalar) {
   assert((p_ != NULL) == (p_scalar != NULL));
 
-  static const unsigned kWindowSize = 7;
-  static const unsigned kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
-
-  alignas(32) union {
-    P256_POINT p;
-    P256_POINT_AFFINE a;
-  } t, p;
+  alignas(32) p256_point_union_t t, p;
 
   if (g_scalar != NULL) {
     uint8_t p_str[33];
@@ -309,10 +359,8 @@
     p_str[32] = 0;
 
     // First window
-    unsigned wvalue = (p_str[0] << 1) & kMask;
-    unsigned index = kWindowSize;
-
-    wvalue = booth_recode_w7(wvalue);
+    unsigned index = 0;
+    unsigned wvalue = calc_first_wvalue(&index, p_str);
 
     const PRECOMP256_ROW *const precomputed_table =
         (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
@@ -328,12 +376,7 @@
     copy_conditional(p.p.Z, ONE, is_not_zero(wvalue >> 1));
 
     for (int i = 1; i < 37; i++) {
-      unsigned off = (index - 1) / 8;
-      wvalue = p_str[off] | p_str[off + 1] << 8;
-      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
-      index += kWindowSize;
-
-      wvalue = booth_recode_w7(wvalue);
+      wvalue = calc_wvalue(&index, p_str);
 
       ecp_nistz256_select_w7(&t.a, precomputed_table[i], wvalue >> 1);
 
@@ -344,28 +387,65 @@
     }
   }
 
-  const int p_is_infinity = g_scalar == NULL;
-  if (p_scalar != NULL) {
-    P256_POINT *out = &t.p;
-    if (p_is_infinity) {
-      out = &p.p;
-    }
+  mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p);
+}
 
-    ecp_nistz256_windowed_mul(group, out, p_, p_scalar);
-    if (!p_is_infinity) {
-      ecp_nistz256_point_add(&p.p, &p.p, out);
-    }
+static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
+                                           EC_RAW_POINT *r,
+                                           const EC_SCALAR *g_scalar,
+                                           const EC_RAW_POINT *p_,
+                                           const EC_SCALAR *p_scalar) {
+  assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL);
+
+  alignas(32) p256_point_union_t t, p;
+  uint8_t p_str[33];
+  OPENSSL_memcpy(p_str, g_scalar->bytes, 32);
+  p_str[32] = 0;
+
+  // First window
+  unsigned index = 0;
+  unsigned wvalue = calc_first_wvalue(&index, p_str);
+
+  const PRECOMP256_ROW *const precomputed_table =
+      (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
+
+  // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
+  // is infinity and |ONE| otherwise. |p| was computed from the table, so it
+  // is infinity iff |wvalue >> 1| is zero.
+  if ((wvalue >> 1) != 0) {
+    OPENSSL_memcpy(&p.a, &precomputed_table[0][(wvalue >> 1) - 1], sizeof(p.a));
+    OPENSSL_memcpy(&p.p.Z, ONE, sizeof(p.p.Z));
+  } else {
+    OPENSSL_memset(&p.a, 0, sizeof(p.a));
+    OPENSSL_memset(p.p.Z, 0, sizeof(p.p.Z));
   }
 
-  assert(group->field.width == P256_LIMBS);
-  OPENSSL_memcpy(r->X.words, p.p.X, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Y.words, p.p.Y, P256_LIMBS * sizeof(BN_ULONG));
-  OPENSSL_memcpy(r->Z.words, p.p.Z, P256_LIMBS * sizeof(BN_ULONG));
+  if ((wvalue & 1) == 1) {
+    ecp_nistz256_neg(p.p.Y, p.p.Y);
+  }
+
+  for (int i = 1; i < 37; i++) {
+    wvalue = calc_wvalue(&index, p_str);
+
+    if ((wvalue >> 1) == 0) {
+      continue;
+    }
+
+    OPENSSL_memcpy(&t.a, &precomputed_table[i][(wvalue >> 1) - 1], sizeof(p.a));
+
+    if ((wvalue & 1) == 1) {
+      ecp_nistz256_neg(t.a.Y, t.a.Y);
+    }
+
+    ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
+  }
+
+  mul_p_add_and_store(group, r, g_scalar, p_, p_scalar, &t, &p);
 }
 
 static int ecp_nistz256_get_affine(const EC_GROUP *group,
-                                   const EC_RAW_POINT *point, BIGNUM *x,
-                                   BIGNUM *y) {
+                                   const EC_RAW_POINT *point, EC_FELEM *x,
+                                   EC_FELEM *y) {
   if (ec_GFp_simple_is_at_infinity(group, point)) {
     OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
     return 0;
@@ -384,27 +464,44 @@
   ecp_nistz256_from_mont(z_inv2, z_inv2);
 
   if (x != NULL) {
-    BN_ULONG x_aff[P256_LIMBS];
-    ecp_nistz256_mul_mont(x_aff, z_inv2, point->X.words);
-    if (!bn_set_words(x, x_aff, P256_LIMBS)) {
-      OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-      return 0;
-    }
+    ecp_nistz256_mul_mont(x->words, z_inv2, point->X.words);
   }
 
   if (y != NULL) {
-    BN_ULONG y_aff[P256_LIMBS];
     ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
-    ecp_nistz256_mul_mont(y_aff, z_inv3, point->Y.words);
-    if (!bn_set_words(y, y_aff, P256_LIMBS)) {
-      OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
-      return 0;
-    }
+    ecp_nistz256_mul_mont(y->words, z_inv3, point->Y.words);
   }
 
   return 1;
 }
 
+static void ecp_nistz256_add(const EC_GROUP *group, EC_RAW_POINT *r,
+                             const EC_RAW_POINT *a_, const EC_RAW_POINT *b_) {
+  P256_POINT a, b;
+  OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(b.X, b_->X.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(b.Y, b_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(b.Z, b_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
+  ecp_nistz256_point_add(&a, &a, &b);
+  OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG));
+}
+
+static void ecp_nistz256_dbl(const EC_GROUP *group, EC_RAW_POINT *r,
+                             const EC_RAW_POINT *a_) {
+  P256_POINT a;
+  OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
+  ecp_nistz256_point_double(&a, &a);
+  OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG));
+  OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG));
+}
+
 static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
                                      const EC_SCALAR *in) {
   // table[i] stores a power of |in| corresponding to the matching enum value.
@@ -486,18 +583,92 @@
   }
 }
 
+static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group,
+                                                 EC_SCALAR *out,
+                                                 const EC_SCALAR *in) {
+  if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+    // No AVX support; fallback to generic code.
+    return ec_GFp_simple_mont_inv_mod_ord_vartime(group, out, in);
+  }
+
+  if (!beeu_mod_inverse_vartime(out->words, in->words, P256_ORDER)) {
+    return 0;
+  }
+
+  // The result should be returned in the Montgomery domain.
+  ec_scalar_to_montgomery(group, out, out);
+  return 1;
+}
+
+static int ecp_nistz256_cmp_x_coordinate(int *out_result, const EC_GROUP *group,
+                                         const EC_POINT *p, const BIGNUM *r,
+                                         BN_CTX *ctx) {
+  *out_result = 0;
+
+  if (ec_GFp_simple_is_at_infinity(group, &p->raw)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
+    return 0;
+  }
+
+  BN_ULONG r_words[P256_LIMBS];
+  if (!bn_copy_words(r_words, P256_LIMBS, r)) {
+    return 0;
+  }
+
+  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
+  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
+  // not.
+  BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS];
+  ecp_nistz256_mul_mont(Z2_mont, p->raw.Z.words, p->raw.Z.words);
+  ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont);
+  ecp_nistz256_from_mont(X, p->raw.X.words);
+
+  if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
+    *out_result = 1;
+    return 1;
+  }
+
+  // During signing the x coefficient is reduced modulo the group order.
+  // Therefore there is a small possibility, less than 1/2^128, that group_order
+  // < p.x < P. in that case we need not only to compare against |r| but also to
+  // compare against r+group_order.
+
+  // P_MINUS_ORDER is the difference between the field order (p) and the group
+  // order (N). This value is not in the Montgomery domain.
+  static const BN_ULONG P_MINUS_ORDER[P256_LIMBS] = {
+      TOBN(0x0c46353d, 0x039cdaae), TOBN(0x43190553, 0x58e8617b),
+      TOBN(0x00000000, 0x00000000), TOBN(0x00000000, 0x00000000)};
+
+  if (bn_less_than_words(r_words, P_MINUS_ORDER, P256_LIMBS)) {
+    // We can add in-place, ignoring the carry, because: r + group_order < p <
+    // 2^256
+    bn_add_words(r_words, r_words, P256_ORDER, P256_LIMBS);
+    ecp_nistz256_mul_mont(r_Z2, r_words, Z2_mont);
+    if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
+      *out_result = 1;
+      return 1;
+    }
+  }
+
+  return 1;
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
   out->group_set_curve = ec_GFp_mont_group_set_curve;
   out->point_get_affine_coordinates = ecp_nistz256_get_affine;
+  out->add = ecp_nistz256_add;
+  out->dbl = ecp_nistz256_dbl;
   out->mul = ecp_nistz256_points_mul;
-  out->mul_public = ecp_nistz256_points_mul;
+  out->mul_public = ecp_nistz256_points_mul_public;
   out->felem_mul = ec_GFp_mont_felem_mul;
   out->felem_sqr = ec_GFp_mont_felem_sqr;
   out->bignum_to_felem = ec_GFp_mont_bignum_to_felem;
   out->felem_to_bignum = ec_GFp_mont_felem_to_bignum;
   out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
+  out->scalar_inv_montgomery_vartime = ecp_nistz256_mont_inv_mod_ord_vartime;
+  out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
 };
 
 #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.h b/src/crypto/fipsmodule/ec/p256-x86_64.h
index 21b461c..9de3240 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.h
@@ -61,6 +61,16 @@
   ecp_nistz256_mul_mont(res, in, ONE);
 }
 
+// ecp_nistz256_to_mont sets |res| to |in|, converted to Montgomery domain
+// by multiplying with RR = 2^512 mod P precomputed for NIST P256 curve.
+static inline void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
+                                        const BN_ULONG in[P256_LIMBS]) {
+  static const BN_ULONG RR[P256_LIMBS] = {
+      TOBN(0x00000000, 0x00000003), TOBN(0xfffffffb, 0xffffffff),
+      TOBN(0xffffffff, 0xfffffffe), TOBN(0x00000004, 0xfffffffd)};
+  ecp_nistz256_mul_mont(res, in, RR);
+}
+
 
 // P-256 scalar operations.
 //
@@ -79,6 +89,12 @@
 void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
                                const BN_ULONG a[P256_LIMBS], int rep);
 
+// beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm.
+// Assumption: 0 < a < p < 2^(256) and p is odd.
+int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
+                             const BN_ULONG a[P256_LIMBS],
+                             const BN_ULONG p[P256_LIMBS]);
+
 
 // P-256 point operations.
 //
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
index 8ed1dd4..ab93dfb 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -24,8 +24,12 @@
 #include <gtest/gtest.h>
 
 #include <openssl/bn.h>
+#include <openssl/cpu.h>
+#include <openssl/ec.h>
 #include <openssl/mem.h>
+#include <openssl/nid.h>
 
+#include "internal.h"
 #include "../bn/internal.h"
 #include "../../internal.h"
 #include "../../test/file_test.h"
@@ -87,6 +91,64 @@
   }
 }
 
+TEST(P256_X86_64Test, BEEU) {
+  if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+    // No AVX support; cannot run the BEEU code.
+    return;
+  }
+
+  bssl::UniquePtr<EC_GROUP> group(
+      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
+  ASSERT_TRUE(group);
+
+  BN_ULONG order_words[P256_LIMBS];
+  ASSERT_TRUE(
+      bn_copy_words(order_words, P256_LIMBS, EC_GROUP_get0_order(group.get())));
+
+  BN_ULONG in[P256_LIMBS], out[P256_LIMBS];
+  EC_SCALAR in_scalar, out_scalar, result;
+  OPENSSL_memset(in, 0, sizeof(in));
+
+  // Trying to find the inverse of zero should fail.
+  ASSERT_FALSE(beeu_mod_inverse_vartime(out, in, order_words));
+
+  // kOneMont is 1, in Montgomery form.
+  static const BN_ULONG kOneMont[P256_LIMBS] = {
+      TOBN(0xc46353d, 0x039cdaaf), TOBN(0x43190552, 0x58e8617b),
+      0, 0xffffffff,
+  };
+
+  for (BN_ULONG i = 1; i < 2000; i++) {
+    SCOPED_TRACE(i);
+
+    in[0] = i;
+    if (i >= 1000) {
+      in[1] = i << 8;
+      in[2] = i << 32;
+      in[3] = i << 48;
+    } else {
+      in[1] = in[2] = in[3] = 0;
+    }
+
+    EXPECT_TRUE(bn_less_than_words(in, order_words, P256_LIMBS));
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, in, order_words));
+    EXPECT_TRUE(bn_less_than_words(out, order_words, P256_LIMBS));
+
+    // Calculate out*in and confirm that it equals one, modulo the order.
+    OPENSSL_memcpy(in_scalar.bytes, in, sizeof(in));
+    OPENSSL_memcpy(out_scalar.bytes, out, sizeof(out));
+    ec_scalar_to_montgomery(group.get(), &in_scalar, &in_scalar);
+    ec_scalar_to_montgomery(group.get(), &out_scalar, &out_scalar);
+    ec_scalar_mul_montgomery(group.get(), &result, &in_scalar, &out_scalar);
+
+    EXPECT_EQ(0, OPENSSL_memcmp(kOneMont, &result, sizeof(kOneMont)));
+
+    // Invert the result and expect to get back to the original value.
+    ASSERT_TRUE(beeu_mod_inverse_vartime(out, out, order_words));
+    EXPECT_EQ(0, OPENSSL_memcmp(in, out, sizeof(in)));
+  }
+}
+
 static bool GetFieldElement(FileTest *t, BN_ULONG out[P256_LIMBS],
                             const char *name) {
   std::vector<uint8_t> bytes;
diff --git a/src/crypto/fipsmodule/ec/scalar.c b/src/crypto/fipsmodule/ec/scalar.c
index 1bd6b02..88678a9 100644
--- a/src/crypto/fipsmodule/ec/scalar.c
+++ b/src/crypto/fipsmodule/ec/scalar.c
@@ -74,3 +74,8 @@
                               const EC_SCALAR *a) {
   group->meth->scalar_inv_montgomery(group, r, a);
 }
+
+int ec_scalar_inv_montgomery_vartime(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a) {
+  return group->meth->scalar_inv_montgomery_vartime(group, r, a);
+}
diff --git a/src/crypto/fipsmodule/ec/simple.c b/src/crypto/fipsmodule/ec/simple.c
index 5c63711..8b862ff 100644
--- a/src/crypto/fipsmodule/ec/simple.c
+++ b/src/crypto/fipsmodule/ec/simple.c
@@ -171,10 +171,6 @@
   return 1;
 }
 
-unsigned ec_GFp_simple_group_get_degree(const EC_GROUP *group) {
-  return BN_num_bits(&group->field);
-}
-
 void ec_GFp_simple_point_init(EC_RAW_POINT *point) {
   OPENSSL_memset(&point->X, 0, sizeof(EC_FELEM));
   OPENSSL_memset(&point->Y, 0, sizeof(EC_FELEM));
@@ -212,222 +208,6 @@
   return 1;
 }
 
-void ec_GFp_simple_add(const EC_GROUP *group, EC_RAW_POINT *out,
-                       const EC_RAW_POINT *a, const EC_RAW_POINT *b) {
-  if (a == b) {
-    ec_GFp_simple_dbl(group, out, a);
-    return;
-  }
-
-
-  // The method is taken from:
-  //   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl
-  //
-  // Coq transcription and correctness proof:
-  // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L467>
-  // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L544>
-  void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
-                          const EC_FELEM *b) = group->meth->felem_mul;
-  void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) =
-      group->meth->felem_sqr;
-
-  EC_FELEM x_out, y_out, z_out;
-  BN_ULONG z1nz = ec_felem_non_zero_mask(group, &a->Z);
-  BN_ULONG z2nz = ec_felem_non_zero_mask(group, &b->Z);
-
-  // z1z1 = z1z1 = z1**2
-  EC_FELEM z1z1;
-  felem_sqr(group, &z1z1, &a->Z);
-
-  // z2z2 = z2**2
-  EC_FELEM z2z2;
-  felem_sqr(group, &z2z2, &b->Z);
-
-  // u1 = x1*z2z2
-  EC_FELEM u1;
-  felem_mul(group, &u1, &a->X, &z2z2);
-
-  // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
-  EC_FELEM two_z1z2;
-  ec_felem_add(group, &two_z1z2, &a->Z, &b->Z);
-  felem_sqr(group, &two_z1z2, &two_z1z2);
-  ec_felem_sub(group, &two_z1z2, &two_z1z2, &z1z1);
-  ec_felem_sub(group, &two_z1z2, &two_z1z2, &z2z2);
-
-  // s1 = y1 * z2**3
-  EC_FELEM s1;
-  felem_mul(group, &s1, &b->Z, &z2z2);
-  felem_mul(group, &s1, &s1, &a->Y);
-
-  // u2 = x2*z1z1
-  EC_FELEM u2;
-  felem_mul(group, &u2, &b->X, &z1z1);
-
-  // h = u2 - u1
-  EC_FELEM h;
-  ec_felem_sub(group, &h, &u2, &u1);
-
-  BN_ULONG xneq = ec_felem_non_zero_mask(group, &h);
-
-  // z_out = two_z1z2 * h
-  felem_mul(group, &z_out, &h, &two_z1z2);
-
-  // z1z1z1 = z1 * z1z1
-  EC_FELEM z1z1z1;
-  felem_mul(group, &z1z1z1, &a->Z, &z1z1);
-
-  // s2 = y2 * z1**3
-  EC_FELEM s2;
-  felem_mul(group, &s2, &b->Y, &z1z1z1);
-
-  // r = (s2 - s1)*2
-  EC_FELEM r;
-  ec_felem_sub(group, &r, &s2, &s1);
-  ec_felem_add(group, &r, &r, &r);
-
-  BN_ULONG yneq = ec_felem_non_zero_mask(group, &r);
-
-  // This case will never occur in the constant-time |ec_GFp_simple_mul|.
-  if (!xneq && !yneq && z1nz && z2nz) {
-    ec_GFp_simple_dbl(group, out, a);
-    return;
-  }
-
-  // I = (2h)**2
-  EC_FELEM i;
-  ec_felem_add(group, &i, &h, &h);
-  felem_sqr(group, &i, &i);
-
-  // J = h * I
-  EC_FELEM j;
-  felem_mul(group, &j, &h, &i);
-
-  // V = U1 * I
-  EC_FELEM v;
-  felem_mul(group, &v, &u1, &i);
-
-  // x_out = r**2 - J - 2V
-  felem_sqr(group, &x_out, &r);
-  ec_felem_sub(group, &x_out, &x_out, &j);
-  ec_felem_sub(group, &x_out, &x_out, &v);
-  ec_felem_sub(group, &x_out, &x_out, &v);
-
-  // y_out = r(V-x_out) - 2 * s1 * J
-  ec_felem_sub(group, &y_out, &v, &x_out);
-  felem_mul(group, &y_out, &y_out, &r);
-  EC_FELEM s1j;
-  felem_mul(group, &s1j, &s1, &j);
-  ec_felem_sub(group, &y_out, &y_out, &s1j);
-  ec_felem_sub(group, &y_out, &y_out, &s1j);
-
-  ec_felem_select(group, &x_out, z1nz, &x_out, &b->X);
-  ec_felem_select(group, &out->X, z2nz, &x_out, &a->X);
-  ec_felem_select(group, &y_out, z1nz, &y_out, &b->Y);
-  ec_felem_select(group, &out->Y, z2nz, &y_out, &a->Y);
-  ec_felem_select(group, &z_out, z1nz, &z_out, &b->Z);
-  ec_felem_select(group, &out->Z, z2nz, &z_out, &a->Z);
-}
-
-void ec_GFp_simple_dbl(const EC_GROUP *group, EC_RAW_POINT *r,
-                       const EC_RAW_POINT *a) {
-  void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a,
-                          const EC_FELEM *b) = group->meth->felem_mul;
-  void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) =
-      group->meth->felem_sqr;
-
-  if (group->a_is_minus3) {
-    // The method is taken from:
-    //   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
-    //
-    // Coq transcription and correctness proof:
-    // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
-    // <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
-    EC_FELEM delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
-    // delta = z^2
-    felem_sqr(group, &delta, &a->Z);
-    // gamma = y^2
-    felem_sqr(group, &gamma, &a->Y);
-    // beta = x*gamma
-    felem_mul(group, &beta, &a->X, &gamma);
-
-    // alpha = 3*(x-delta)*(x+delta)
-    ec_felem_sub(group, &ftmp, &a->X, &delta);
-    ec_felem_add(group, &ftmp2, &a->X, &delta);
-
-    ec_felem_add(group, &tmptmp, &ftmp2, &ftmp2);
-    ec_felem_add(group, &ftmp2, &ftmp2, &tmptmp);
-    felem_mul(group, &alpha, &ftmp, &ftmp2);
-
-    // x' = alpha^2 - 8*beta
-    felem_sqr(group, &r->X, &alpha);
-    ec_felem_add(group, &fourbeta, &beta, &beta);
-    ec_felem_add(group, &fourbeta, &fourbeta, &fourbeta);
-    ec_felem_add(group, &tmptmp, &fourbeta, &fourbeta);
-    ec_felem_sub(group, &r->X, &r->X, &tmptmp);
-
-    // z' = (y + z)^2 - gamma - delta
-    ec_felem_add(group, &delta, &gamma, &delta);
-    ec_felem_add(group, &ftmp, &a->Y, &a->Z);
-    felem_sqr(group, &r->Z, &ftmp);
-    ec_felem_sub(group, &r->Z, &r->Z, &delta);
-
-    // y' = alpha*(4*beta - x') - 8*gamma^2
-    ec_felem_sub(group, &r->Y, &fourbeta, &r->X);
-    ec_felem_add(group, &gamma, &gamma, &gamma);
-    felem_sqr(group, &gamma, &gamma);
-    felem_mul(group, &r->Y, &alpha, &r->Y);
-    ec_felem_add(group, &gamma, &gamma, &gamma);
-    ec_felem_sub(group, &r->Y, &r->Y, &gamma);
-  } else {
-    // The method is taken from:
-    //   http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
-    //
-    // Coq transcription and correctness proof:
-    // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L102>
-    // <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L534>
-    EC_FELEM xx, yy, yyyy, zz;
-    felem_sqr(group, &xx, &a->X);
-    felem_sqr(group, &yy, &a->Y);
-    felem_sqr(group, &yyyy, &yy);
-    felem_sqr(group, &zz, &a->Z);
-
-    // s = 2*((x_in + yy)^2 - xx - yyyy)
-    EC_FELEM s;
-    ec_felem_add(group, &s, &a->X, &yy);
-    felem_sqr(group, &s, &s);
-    ec_felem_sub(group, &s, &s, &xx);
-    ec_felem_sub(group, &s, &s, &yyyy);
-    ec_felem_add(group, &s, &s, &s);
-
-    // m = 3*xx + a*zz^2
-    EC_FELEM m;
-    felem_sqr(group, &m, &zz);
-    felem_mul(group, &m, &group->a, &m);
-    ec_felem_add(group, &m, &m, &xx);
-    ec_felem_add(group, &m, &m, &xx);
-    ec_felem_add(group, &m, &m, &xx);
-
-    // x_out = m^2 - 2*s
-    felem_sqr(group, &r->X, &m);
-    ec_felem_sub(group, &r->X, &r->X, &s);
-    ec_felem_sub(group, &r->X, &r->X, &s);
-
-    // z_out = (y_in + z_in)^2 - yy - zz
-    ec_felem_add(group, &r->Z, &a->Y, &a->Z);
-    felem_sqr(group, &r->Z, &r->Z);
-    ec_felem_sub(group, &r->Z, &r->Z, &yy);
-    ec_felem_sub(group, &r->Z, &r->Z, &zz);
-
-    // y_out = m*(s-x_out) - 8*yyyy
-    ec_felem_add(group, &yyyy, &yyyy, &yyyy);
-    ec_felem_add(group, &yyyy, &yyyy, &yyyy);
-    ec_felem_add(group, &yyyy, &yyyy, &yyyy);
-    ec_felem_sub(group, &r->Y, &s, &r->X);
-    felem_mul(group, &r->Y, &r->Y, &m);
-    ec_felem_sub(group, &r->Y, &r->Y, &yyyy);
-  }
-}
-
 void ec_GFp_simple_invert(const EC_GROUP *group, EC_RAW_POINT *point) {
   ec_felem_neg(group, &point->Y, &point->Y);
 }
@@ -570,3 +350,52 @@
   // The points are equal.
   return 0;
 }
+
+int ec_GFp_simple_mont_inv_mod_ord_vartime(const EC_GROUP *group,
+                                           EC_SCALAR *out,
+                                           const EC_SCALAR *in) {
+  // This implementation (in fact) runs in constant time,
+  // even though for this interface it is not mandatory.
+
+  // out = in^-1 in the Montgomery domain. This is
+  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
+  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
+  // equivalent and slightly more efficient.
+  ec_scalar_inv_montgomery(group, out, in);
+  ec_scalar_from_montgomery(group, out, out);
+  return 1;
+}
+
+// Compares the x (affine) coordinate of the point p with x.
+// Return 1 on success 0 otherwise
+int ec_GFp_simple_cmp_x_coordinate(int *out_result, const EC_GROUP *group,
+                                   const EC_POINT *p, const BIGNUM *r,
+                                   BN_CTX *ctx) {
+  *out_result = 0;
+  int ret = 0;
+  BN_CTX_start(ctx);
+
+  BIGNUM *X = BN_CTX_get(ctx);
+  if (X == NULL) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
+    goto out;
+  }
+
+  if (!EC_POINT_get_affine_coordinates_GFp(group, p, X, NULL, ctx)) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB);
+    goto out;
+  }
+
+  if (!ec_field_element_to_scalar(group, X)) {
+    OPENSSL_PUT_ERROR(ECDSA, ERR_R_BN_LIB);
+    goto out;
+  }
+
+  // The signature is correct iff |X| is equal to |r|.
+  *out_result = (BN_ucmp(X, r) == 0);
+  ret = 1;
+
+out:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/src/crypto/fipsmodule/ec/simple_mul.c b/src/crypto/fipsmodule/ec/simple_mul.c
index 93ed0a8..e05f491 100644
--- a/src/crypto/fipsmodule/ec/simple_mul.c
+++ b/src/crypto/fipsmodule/ec/simple_mul.c
@@ -21,12 +21,12 @@
 #include "../../internal.h"
 
 
-static void ec_GFp_simple_mul_single(const EC_GROUP *group, EC_RAW_POINT *r,
-                                     const EC_RAW_POINT *p,
-                                     const EC_SCALAR *scalar) {
+static void ec_GFp_mont_mul_single(const EC_GROUP *group, EC_RAW_POINT *r,
+                                   const EC_RAW_POINT *p,
+                                   const EC_SCALAR *scalar) {
   // This is a generic implementation for uncommon curves that not do not
   // warrant a tuned one. It uses unsigned digits so that the doubling case in
-  // |ec_GFp_simple_add| is always unreachable, erring on safety and simplicity.
+  // |ec_GFp_mont_add| is always unreachable, erring on safety and simplicity.
 
   // Compute a table of the first 32 multiples of |p| (including infinity).
   EC_RAW_POINT precomp[32];
@@ -34,9 +34,9 @@
   ec_GFp_simple_point_copy(&precomp[1], p);
   for (size_t j = 2; j < OPENSSL_ARRAY_SIZE(precomp); j++) {
     if (j & 1) {
-      ec_GFp_simple_add(group, &precomp[j], &precomp[1], &precomp[j - 1]);
+      ec_GFp_mont_add(group, &precomp[j], &precomp[1], &precomp[j - 1]);
     } else {
-      ec_GFp_simple_dbl(group, &precomp[j], &precomp[j / 2]);
+      ec_GFp_mont_dbl(group, &precomp[j], &precomp[j / 2]);
     }
   }
 
@@ -45,7 +45,7 @@
   int r_is_at_infinity = 1;
   for (unsigned i = bits - 1; i < bits; i--) {
     if (!r_is_at_infinity) {
-      ec_GFp_simple_dbl(group, r, r);
+      ec_GFp_mont_dbl(group, r, r);
     }
     if (i % 5 == 0) {
       // Compute the next window value.
@@ -70,7 +70,7 @@
         ec_GFp_simple_point_copy(r, &tmp);
         r_is_at_infinity = 0;
       } else {
-        ec_GFp_simple_add(group, r, r, &tmp);
+        ec_GFp_mont_add(group, r, r, &tmp);
       }
     }
   }
@@ -79,21 +79,21 @@
   }
 }
 
-void ec_GFp_simple_mul(const EC_GROUP *group, EC_RAW_POINT *r,
-                       const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
-                       const EC_SCALAR *p_scalar) {
+void ec_GFp_mont_mul(const EC_GROUP *group, EC_RAW_POINT *r,
+                     const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                     const EC_SCALAR *p_scalar) {
   assert(g_scalar != NULL || p_scalar != NULL);
   if (p_scalar == NULL) {
-    ec_GFp_simple_mul_single(group, r, &group->generator->raw, g_scalar);
+    ec_GFp_mont_mul_single(group, r, &group->generator->raw, g_scalar);
   } else if (g_scalar == NULL) {
-    ec_GFp_simple_mul_single(group, r, p, p_scalar);
+    ec_GFp_mont_mul_single(group, r, p, p_scalar);
   } else {
     // Support constant-time two-point multiplication for compatibility.  This
     // does not actually come up in keygen, ECDH, or ECDSA, so we implement it
     // the naive way.
-    ec_GFp_simple_mul_single(group, r, &group->generator->raw, g_scalar);
+    ec_GFp_mont_mul_single(group, r, &group->generator->raw, g_scalar);
     EC_RAW_POINT tmp;
-    ec_GFp_simple_mul_single(group, &tmp, p, p_scalar);
-    ec_GFp_simple_add(group, r, r, &tmp);
+    ec_GFp_mont_mul_single(group, &tmp, p, p_scalar);
+    ec_GFp_mont_add(group, r, r, &tmp);
   }
 }
diff --git a/src/crypto/fipsmodule/ec/wnaf.c b/src/crypto/fipsmodule/ec/wnaf.c
index 145caa0..c0c2809 100644
--- a/src/crypto/fipsmodule/ec/wnaf.c
+++ b/src/crypto/fipsmodule/ec/wnaf.c
@@ -151,9 +151,9 @@
                             const EC_RAW_POINT *p, size_t len) {
   ec_GFp_simple_point_copy(&out[0], p);
   EC_RAW_POINT two_p;
-  ec_GFp_simple_dbl(group, &two_p, p);
+  ec_GFp_mont_dbl(group, &two_p, p);
   for (size_t i = 1; i < len; i++) {
-    ec_GFp_simple_add(group, &out[i], &out[i - 1], &two_p);
+    ec_GFp_mont_add(group, &out[i], &out[i - 1], &two_p);
   }
 }
 
@@ -168,15 +168,15 @@
   }
 }
 
-// EC_WNAF_WINDOW_BITS is the window size to use for |ec_GFp_simple_mul_public|.
+// EC_WNAF_WINDOW_BITS is the window size to use for |ec_GFp_mont_mul_public|.
 #define EC_WNAF_WINDOW_BITS 4
 
-// EC_WNAF_TABLE_SIZE is the table size to use for |ec_GFp_simple_mul_public|.
+// EC_WNAF_TABLE_SIZE is the table size to use for |ec_GFp_mont_mul_public|.
 #define EC_WNAF_TABLE_SIZE (1 << (EC_WNAF_WINDOW_BITS - 1))
 
-void ec_GFp_simple_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
-                              const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
-                              const EC_SCALAR *p_scalar) {
+void ec_GFp_mont_mul_public(const EC_GROUP *group, EC_RAW_POINT *r,
+                            const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
+                            const EC_SCALAR *p_scalar) {
   size_t bits = BN_num_bits(&group->order);
   size_t wNAF_len = bits + 1;
 
@@ -197,7 +197,7 @@
   int r_is_at_infinity = 1;
   for (size_t k = wNAF_len - 1; k < wNAF_len; k--) {
     if (!r_is_at_infinity) {
-      ec_GFp_simple_dbl(group, r, r);
+      ec_GFp_mont_dbl(group, r, r);
     }
 
     if (g_wNAF[k] != 0) {
@@ -206,7 +206,7 @@
         ec_GFp_simple_point_copy(r, &tmp);
         r_is_at_infinity = 0;
       } else {
-        ec_GFp_simple_add(group, r, r, &tmp);
+        ec_GFp_mont_add(group, r, r, &tmp);
       }
     }
 
@@ -216,7 +216,7 @@
         ec_GFp_simple_point_copy(r, &tmp);
         r_is_at_infinity = 0;
       } else {
-        ec_GFp_simple_add(group, r, r, &tmp);
+        ec_GFp_mont_add(group, r, r, &tmp);
       }
     }
   }