external/boringssl: Sync to a63d0ad40dd621d5b9472dc9f1756692f969451e.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/9f0e7cb314ae64234b928fd379381ae9760a9a5f..a63d0ad40dd621d5b9472dc9f1756692f969451e

Test: BoringSSL CTS Presubmits.
Change-Id: I283b7d8f01ceef3becb152708b65894c717e3680
diff --git a/src/crypto/bio/connect.c b/src/crypto/bio/connect.c
index 0b60f6a..604803a 100644
--- a/src/crypto/bio/connect.c
+++ b/src/crypto/bio/connect.c
@@ -56,6 +56,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <assert.h>
 #include <errno.h>
 #include <string.h>
@@ -540,3 +542,5 @@
 int BIO_do_connect(BIO *bio) {
   return BIO_ctrl(bio, BIO_C_DO_STATE_MACHINE, 0, NULL);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/fd.c b/src/crypto/bio/fd.c
index fed5228..877f53d 100644
--- a/src/crypto/bio/fd.c
+++ b/src/crypto/bio/fd.c
@@ -56,6 +56,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <errno.h>
 #include <string.h>
 
@@ -274,3 +276,5 @@
 int BIO_get_fd(BIO *bio, int *out_fd) {
   return BIO_ctrl(bio, BIO_C_GET_FD, 0, (char *) out_fd);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/file.c b/src/crypto/bio/file.c
index f61dbe4..6a0b9a9 100644
--- a/src/crypto/bio/file.c
+++ b/src/crypto/bio/file.c
@@ -73,6 +73,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
@@ -313,3 +315,5 @@
   return BIO_ctrl(bio, BIO_C_SET_FILENAME,
                   BIO_CLOSE | BIO_FP_READ | BIO_FP_WRITE, (char *)filename);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/socket.c b/src/crypto/bio/socket.c
index 111761f..081ce01 100644
--- a/src/crypto/bio/socket.c
+++ b/src/crypto/bio/socket.c
@@ -57,6 +57,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <fcntl.h>
 #include <string.h>
 
@@ -200,3 +202,5 @@
   BIO_set_fd(ret, fd, close_flag);
   return ret;
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/socket_helper.c b/src/crypto/bio/socket_helper.c
index 268405a..d4209d0 100644
--- a/src/crypto/bio/socket_helper.c
+++ b/src/crypto/bio/socket_helper.c
@@ -18,6 +18,8 @@
 #include <openssl/bio.h>
 #include <openssl/err.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <fcntl.h>
 #include <string.h>
 #include <sys/types.h>
@@ -112,3 +114,5 @@
   }
   return error;
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/cipher_extra/tls_cbc.c b/src/crypto/cipher_extra/tls_cbc.c
index 6f95130..a24602b 100644
--- a/src/crypto/cipher_extra/tls_cbc.c
+++ b/src/crypto/cipher_extra/tls_cbc.c
@@ -271,7 +271,7 @@
   HASH_CTX md_state;
   void (*md_final_raw)(HASH_CTX *ctx, uint8_t *md_out);
   void (*md_transform)(HASH_CTX *ctx, const uint8_t *block);
-  unsigned md_size, md_block_size = 64;
+  unsigned md_size, md_block_size = 64, md_block_shift = 6;
   // md_length_size is the number of bytes in the length field that terminates
   // the hash.
   unsigned md_length_size = 8;
@@ -305,6 +305,7 @@
       md_transform = tls1_sha512_transform;
       md_size = SHA384_DIGEST_LENGTH;
       md_block_size = 128;
+      md_block_shift = 7;
       md_length_size = 16;
       break;
 
@@ -318,6 +319,7 @@
 
   assert(md_length_size <= MAX_HASH_BIT_COUNT_BYTES);
   assert(md_block_size <= MAX_HASH_BLOCK_SIZE);
+  assert(md_block_size == (1u << md_block_shift));
   assert(md_size <= EVP_MAX_MD_SIZE);
 
   static const size_t kHeaderLength = 13;
@@ -350,18 +352,16 @@
   // k is the starting byte offset into the conceptual header||data where
   // we start processing.
   size_t k = 0;
-  // mac_end_offset is the index just past the end of the data to be
-  // MACed.
+  // mac_end_offset is the index just past the end of the data to be MACed.
   size_t mac_end_offset = data_plus_mac_size + kHeaderLength - md_size;
-  // c is the index of the 0x80 byte in the final hash block that
-  // contains application data.
-  size_t c = mac_end_offset % md_block_size;
-  // index_a is the hash block number that contains the 0x80 terminating
-  // value.
-  size_t index_a = mac_end_offset / md_block_size;
-  // index_b is the hash block number that contains the 64-bit hash
-  // length, in bits.
-  size_t index_b = (mac_end_offset + md_length_size) / md_block_size;
+  // c is the index of the 0x80 byte in the final hash block that contains
+  // application data.
+  size_t c = mac_end_offset & (md_block_size - 1);
+  // index_a is the hash block number that contains the 0x80 terminating value.
+  size_t index_a = mac_end_offset >> md_block_shift;
+  // index_b is the hash block number that contains the hash length (in bits),
+  // which occupies |md_length_size| bytes.
+  size_t index_b = (mac_end_offset + md_length_size) >> md_block_shift;
 
   if (num_blocks > kVarianceBlocks) {
     num_starting_blocks = num_blocks - kVarianceBlocks;
diff --git a/src/crypto/cpu-intel.c b/src/crypto/cpu-intel.c
index 1ac280c..701ebed 100644
--- a/src/crypto/cpu-intel.c
+++ b/src/crypto/cpu-intel.c
@@ -170,10 +170,11 @@
     }
   }
 
-  uint32_t extended_features = 0;
+  uint32_t extended_features[2] = {0};
   if (num_ids >= 7) {
     OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7);
-    extended_features = ebx;
+    extended_features[0] = ebx;
+    extended_features[1] = ecx;
   }
 
   // Determine the number of cores sharing an L1 data cache to adjust the
@@ -241,26 +242,26 @@
     //
     // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream
     // doesn't clear those.
-    extended_features &=
+    extended_features[0] &=
         ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31));
   }
   // See Intel manual, volume 1, section 15.2.
   if ((xcr0 & 0xe6) != 0xe6) {
     // Clear AVX512F. Note we don't touch other AVX512 extensions because they
     // can be used with YMM.
-    extended_features &= ~(1u << 16);
+    extended_features[0] &= ~(1u << 16);
   }
 
   // Disable ADX instructions on Knights Landing. See OpenSSL commit
   // 64d92d74985ebb3d0be58a9718f9e080a14a8e7f.
   if ((ecx & (1u << 26)) == 0) {
-    extended_features &= ~(1u << 19);
+    extended_features[0] &= ~(1u << 19);
   }
 
   OPENSSL_ia32cap_P[0] = edx;
   OPENSSL_ia32cap_P[1] = ecx;
-  OPENSSL_ia32cap_P[2] = extended_features;
-  OPENSSL_ia32cap_P[3] = 0;
+  OPENSSL_ia32cap_P[2] = extended_features[0];
+  OPENSSL_ia32cap_P[3] = extended_features[1];
 
   const char *env1, *env2;
   env1 = getenv("OPENSSL_ia32cap");
diff --git a/src/crypto/fipsmodule/bcm.c b/src/crypto/fipsmodule/bcm.c
index 028ec4e..f382313 100644
--- a/src/crypto/fipsmodule/bcm.c
+++ b/src/crypto/fipsmodule/bcm.c
@@ -63,6 +63,7 @@
 #include "ec/p224-64.c"
 #include "../../third_party/fiat/p256.c"
 #include "ec/p256-x86_64.c"
+#include "ec/scalar.c"
 #include "ec/simple.c"
 #include "ec/util.c"
 #include "ec/wnaf.c"
diff --git a/src/crypto/fipsmodule/bn/bn_test.cc b/src/crypto/fipsmodule/bn/bn_test.cc
index 93d6d0f..a25d487 100644
--- a/src/crypto/fipsmodule/bn/bn_test.cc
+++ b/src/crypto/fipsmodule/bn/bn_test.cc
@@ -467,13 +467,13 @@
           r_words(new BN_ULONG[num_r]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
 
-      ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
-                               a_words.get(), num_a));
+      bn_mul_small(r_words.get(), num_r, a_words.get(), num_a, a_words.get(),
+                   num_a);
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
       EXPECT_BIGNUMS_EQUAL("A * A (words)", square.get(), ret.get());
 
       OPENSSL_memset(r_words.get(), 'A', num_r * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_sqr_small(r_words.get(), num_r, a_words.get(), num_a));
+      bn_sqr_small(r_words.get(), num_r, a_words.get(), num_a);
 
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
       EXPECT_BIGNUMS_EQUAL("A^2 (words)", square.get(), ret.get());
@@ -535,8 +535,8 @@
         ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
         ASSERT_TRUE(bn_copy_words(b_words.get(), num_b, b.get()));
 
-        ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
-                                 b_words.get(), num_b));
+        bn_mul_small(r_words.get(), num_r, a_words.get(), num_a, b_words.get(),
+                     num_b);
         ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
         EXPECT_BIGNUMS_EQUAL("A * B (words)", product.get(), ret.get());
       }
@@ -630,8 +630,17 @@
     // Reduce |a| and |b| and test the Montgomery version.
     bssl::UniquePtr<BN_MONT_CTX> mont(
         BN_MONT_CTX_new_for_modulus(m.get(), ctx));
-    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(mont);
+
+    // Sanity-check that the constant-time version computes the same n0 and RR.
+    bssl::UniquePtr<BN_MONT_CTX> mont2(
+        BN_MONT_CTX_new_consttime(m.get(), ctx));
+    ASSERT_TRUE(mont2);
+    EXPECT_BIGNUMS_EQUAL("RR (mod M) (constant-time)", &mont->RR, &mont2->RR);
+    EXPECT_EQ(mont->n0[0], mont2->n0[0]);
+    EXPECT_EQ(mont->n0[1], mont2->n0[1]);
+
+    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(a_tmp);
     ASSERT_TRUE(b_tmp);
     ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
@@ -651,16 +660,13 @@
           b_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
       ASSERT_TRUE(bn_copy_words(b_words.get(), m_width, b.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(b_words.get(), m_width, b_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m_width, a_words.get(), m_width, b_words.get(), m_width,
-          mont.get()));
+      bn_to_montgomery_small(a_words.get(), a_words.get(), m_width, mont.get());
+      bn_to_montgomery_small(b_words.get(), b_words.get(), m_width, mont.get());
+      bn_mod_mul_montgomery_small(r_words.get(), a_words.get(), b_words.get(),
+                                  m_width, mont.get());
       // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width, r_words.get(),
-                                           m_width, mont.get()));
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width,
+                               mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * B (mod M) (Montgomery, words)", mod_mul.get(),
                            ret.get());
@@ -718,13 +724,10 @@
       std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m_width]),
           a_copy_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m_width, a_words.get(), m_width, a_words.get(),
-          m_width, mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
-                                           r_words.get(), m_width, mont.get()));
+      bn_to_montgomery_small(a_words.get(), a_words.get(), m_width, mont.get());
+      bn_mod_mul_montgomery_small(r_words.get(), a_words.get(), a_words.get(),
+                                  m_width, mont.get());
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width, mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * A (mod M) (Montgomery, words)",
                            mod_square.get(), ret.get());
@@ -732,12 +735,11 @@
       // Repeat the operation with |a_copy_words|.
       OPENSSL_memcpy(a_copy_words.get(), a_words.get(),
                      m_width * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m_width, a_words.get(), m_width, a_copy_words.get(),
-          m_width, mont.get()));
+      bn_mod_mul_montgomery_small(r_words.get(), a_words.get(),
+                                  a_copy_words.get(), m_width, mont.get());
       // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
-                                           r_words.get(), m_width, mont.get()));
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width,
+                               mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * A_copy (mod M) (Montgomery, words)",
                            mod_square.get(), ret.get());
@@ -761,6 +763,9 @@
   ASSERT_TRUE(BN_mod_exp(ret.get(), a.get(), e.get(), m.get(), ctx));
   EXPECT_BIGNUMS_EQUAL("A ^ E (mod M)", mod_exp.get(), ret.get());
 
+  // The other implementations require reduced inputs.
+  ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
+
   if (BN_is_odd(m.get())) {
     ASSERT_TRUE(
         BN_mod_exp_mont(ret.get(), a.get(), e.get(), m.get(), ctx, NULL));
@@ -778,16 +783,14 @@
       bssl::UniquePtr<BN_MONT_CTX> mont(
           BN_MONT_CTX_new_for_modulus(m.get(), ctx));
       ASSERT_TRUE(mont.get());
-      ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
       std::unique_ptr<BN_ULONG[]> r_words(new BN_ULONG[m_width]),
           a_words(new BN_ULONG[m_width]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_mod_exp_mont_small(r_words.get(), m_width, a_words.get(),
-                                        m_width, e->d, e->width, mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
-                                           r_words.get(), m_width, mont.get()));
+      bn_to_montgomery_small(a_words.get(), a_words.get(), m_width, mont.get());
+      bn_mod_exp_mont_small(r_words.get(), a_words.get(), m_width, e->d,
+                            e->width, mont.get());
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width,
+                               mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A ^ E (mod M) (Montgomery, words)", mod_exp.get(),
                            ret.get());
@@ -1530,6 +1533,10 @@
   EXPECT_FALSE(mont);
   ERR_clear_error();
 
+  mont.reset(BN_MONT_CTX_new_consttime(b.get(), ctx()));
+  EXPECT_FALSE(mont);
+  ERR_clear_error();
+
   // Some operations also may not be used with an even modulus.
   ASSERT_TRUE(BN_set_word(b.get(), 16));
 
@@ -1537,6 +1544,10 @@
   EXPECT_FALSE(mont);
   ERR_clear_error();
 
+  mont.reset(BN_MONT_CTX_new_consttime(b.get(), ctx()));
+  EXPECT_FALSE(mont);
+  ERR_clear_error();
+
   EXPECT_FALSE(BN_mod_exp_mont(a.get(), BN_value_one(), BN_value_one(), b.get(),
                                ctx(), NULL));
   ERR_clear_error();
@@ -1555,21 +1566,16 @@
   ASSERT_TRUE(BN_rand(a.get(), 1024, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY));
   BN_zero(zero.get());
 
-  ASSERT_TRUE(
-      BN_mod_exp(r.get(), a.get(), zero.get(), BN_value_one(), nullptr));
-  EXPECT_TRUE(BN_is_zero(r.get()));
-
-  ASSERT_TRUE(BN_mod_exp_mont(r.get(), a.get(), zero.get(), BN_value_one(),
-                              nullptr, nullptr));
-  EXPECT_TRUE(BN_is_zero(r.get()));
-
-  ASSERT_TRUE(BN_mod_exp_mont_consttime(r.get(), a.get(), zero.get(),
-                                        BN_value_one(), nullptr, nullptr));
+  ASSERT_TRUE(BN_mod_exp(r.get(), a.get(), zero.get(), BN_value_one(), ctx()));
   EXPECT_TRUE(BN_is_zero(r.get()));
 
   ASSERT_TRUE(BN_mod_exp_mont_word(r.get(), 42, zero.get(), BN_value_one(),
-                                   nullptr, nullptr));
+                                   ctx(), nullptr));
   EXPECT_TRUE(BN_is_zero(r.get()));
+
+  // The other modular exponentiation functions, |BN_mod_exp_mont| and
+  // |BN_mod_exp_mont_consttime|, require fully-reduced inputs, so 1**0 mod 1 is
+  // not a valid call.
 }
 
 TEST_F(BNTest, SmallPrime) {
@@ -2262,26 +2268,43 @@
   EXPECT_TRUE(BN_is_pow2(eight.get()));
 
   // |BN_MONT_CTX| is always stored minimally and uses the same R independent of
-  // input width.
+  // input width. Additionally, mont->RR is always the same width as mont->N,
+  // even if it fits in a smaller value.
   static const uint8_t kP[] = {
-      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01,
   };
   bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
   ASSERT_TRUE(p);
 
+  // Test both the constant-time and variable-time functions at both minimal and
+  // non-minimal |p|.
   bssl::UniquePtr<BN_MONT_CTX> mont(
       BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
   ASSERT_TRUE(mont);
-
-  ASSERT_TRUE(bn_resize_words(p.get(), 32));
   bssl::UniquePtr<BN_MONT_CTX> mont2(
-      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+      BN_MONT_CTX_new_consttime(p.get(), ctx()));
   ASSERT_TRUE(mont2);
 
+  ASSERT_TRUE(bn_resize_words(p.get(), 32));
+  bssl::UniquePtr<BN_MONT_CTX> mont3(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont3);
+  bssl::UniquePtr<BN_MONT_CTX> mont4(
+      BN_MONT_CTX_new_consttime(p.get(), ctx()));
+  ASSERT_TRUE(mont4);
+
   EXPECT_EQ(mont->N.width, mont2->N.width);
+  EXPECT_EQ(mont->N.width, mont3->N.width);
+  EXPECT_EQ(mont->N.width, mont4->N.width);
   EXPECT_EQ(0, BN_cmp(&mont->RR, &mont2->RR));
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont3->RR));
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont4->RR));
+  EXPECT_EQ(mont->N.width, mont->RR.width);
+  EXPECT_EQ(mont->N.width, mont2->RR.width);
+  EXPECT_EQ(mont->N.width, mont3->RR.width);
+  EXPECT_EQ(mont->N.width, mont4->RR.width);
 }
 
 TEST_F(BNTest, CountLowZeroBits) {
diff --git a/src/crypto/fipsmodule/bn/bn_tests.txt b/src/crypto/fipsmodule/bn/bn_tests.txt
index 7f85a02..6bdca42 100644
--- a/src/crypto/fipsmodule/bn/bn_tests.txt
+++ b/src/crypto/fipsmodule/bn/bn_tests.txt
@@ -10120,6 +10120,17 @@
 E = 02
 M = 414141414141414141414127414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001
 
+# Cover the E = 0 case for small numbers.
+ModExp = 01
+A = 86b49
+E = 00
+M = 30d26ecb
+
+ModExp = 00
+A = 00
+E = 00
+M = 01
+
 ModExp = 208f8aa0
 A = 86b49
 E = 2
diff --git a/src/crypto/fipsmodule/bn/exponentiation.c b/src/crypto/fipsmodule/bn/exponentiation.c
index c85c00b..b07111e 100644
--- a/src/crypto/fipsmodule/bn/exponentiation.c
+++ b/src/crypto/fipsmodule/bn/exponentiation.c
@@ -109,6 +109,7 @@
 #include <openssl/bn.h>
 
 #include <assert.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include <openssl/cpu.h>
@@ -585,6 +586,13 @@
 
 int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
                BN_CTX *ctx) {
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    if (!BN_nnmod(r, a, m, ctx)) {
+      return 0;
+    }
+    a = r;
+  }
+
   if (BN_is_odd(m)) {
     return BN_mod_exp_mont(r, a, p, m, ctx, NULL);
   }
@@ -598,6 +606,11 @@
     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
     return 0;
   }
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
+    return 0;
+  }
+
   int bits = BN_num_bits(p);
   if (bits == 0) {
     // x**0 mod 1 is still zero.
@@ -622,35 +635,19 @@
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    new_mont = BN_MONT_CTX_new_consttime(m, ctx);
     if (new_mont == NULL) {
       goto err;
     }
     mont = new_mont;
   }
 
-  const BIGNUM *aa;
-  if (a->neg || BN_ucmp(a, m) >= 0) {
-    if (!BN_nnmod(val[0], a, m, ctx)) {
-      goto err;
-    }
-    aa = val[0];
-  } else {
-    aa = a;
-  }
-
-  if (BN_is_zero(aa)) {
-    BN_zero(rr);
-    ret = 1;
-    goto err;
-  }
-
   // We exponentiate by looking at sliding windows of the exponent and
-  // precomputing powers of |aa|. Windows may be shifted so they always end on a
-  // set bit, so only precompute odd powers. We compute val[i] = aa^(2*i + 1)
+  // precomputing powers of |a|. Windows may be shifted so they always end on a
+  // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1)
   // for i = 0 to 2^(window-1), all in Montgomery form.
   int window = BN_window_bits_for_exponent_size(bits);
-  if (!BN_to_montgomery(val[0], aa, mont, ctx)) {
+  if (!BN_to_montgomery(val[0], a, mont, ctx)) {
     goto err;
   }
   if (window > 1) {
@@ -666,10 +663,8 @@
     }
   }
 
-  if (!bn_one_to_montgomery(r, mont, ctx)) {
-    goto err;
-  }
-
+  // |p| is non-zero, so at least one window is non-zero. To save some
+  // multiplications, defer initializing |r| until then.
   int r_is_one = 1;
   int wstart = bits - 1;  // The top bit of the window.
   for (;;) {
@@ -706,7 +701,11 @@
 
     assert(wvalue & 1);
     assert(wvalue < (1 << window));
-    if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
+    if (r_is_one) {
+      if (!BN_copy(r, val[wvalue >> 1])) {
+        goto err;
+      }
+    } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
       goto err;
     }
 
@@ -717,6 +716,9 @@
     wstart -= wsize + 1;
   }
 
+  // |p| is non-zero, so |r_is_one| must be cleared at some point.
+  assert(!r_is_one);
+
   if (!BN_from_montgomery(rr, r, mont, ctx)) {
     goto err;
   }
@@ -728,29 +730,24 @@
   return ret;
 }
 
-int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                          size_t num_a, const BN_ULONG *p, size_t num_p,
-                          const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.width;
-  if (num_n != num_a || num_n != num_r || num_n > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                           const BN_ULONG *p, size_t num_p,
+                           const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
-  if (!BN_is_odd(&mont->N)) {
-    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
-    return 0;
+  assert(BN_is_odd(&mont->N));
+
+  // Count the number of bits in |p|. Note this function treats |p| as public.
+  while (num_p != 0 && p[num_p - 1] == 0) {
+    num_p--;
   }
-  unsigned bits = 0;
-  if (num_p != 0) {
-    bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2;
+  if (num_p == 0) {
+    bn_from_montgomery_small(r, mont->RR.d, num, mont);
+    return;
   }
-  if (bits == 0) {
-    OPENSSL_memset(r, 0, num_r * sizeof(BN_ULONG));
-    if (!BN_is_one(&mont->N)) {
-      r[0] = 1;
-    }
-    return 1;
-  }
+  unsigned bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2;
+  assert(bits != 0);
 
   // We exponentiate by looking at sliding windows of the exponent and
   // precomputing powers of |a|. Windows may be shifted so they always end on a
@@ -760,34 +757,24 @@
   if (window > TABLE_BITS_SMALL) {
     window = TABLE_BITS_SMALL;  // Tolerate excessively large |p|.
   }
-  int ret = 0;
   BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS];
-  OPENSSL_memcpy(val[0], a, num_n * sizeof(BN_ULONG));
+  OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG));
   if (window > 1) {
     BN_ULONG d[BN_SMALL_MAX_WORDS];
-    if (!bn_mod_mul_montgomery_small(d, num_n, val[0], num_n, val[0], num_n,
-                                     mont)) {
-      goto err;
-    }
+    bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont);
     for (unsigned i = 1; i < 1u << (window - 1); i++) {
-      if (!bn_mod_mul_montgomery_small(val[i], num_n, val[i - 1], num_n, d,
-                                       num_n, mont)) {
-        goto err;
-      }
+      bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont);
     }
   }
 
-  if (!bn_one_to_montgomery_small(r, num_r, mont)) {
-    goto err;
-  }
-
+  // |p| is non-zero, so at least one window is non-zero. To save some
+  // multiplications, defer initializing |r| until then.
   int r_is_one = 1;
   unsigned wstart = bits - 1;  // The top bit of the window.
   for (;;) {
     if (!bn_is_bit_set_words(p, num_p, wstart)) {
-      if (!r_is_one &&
-          !bn_mod_mul_montgomery_small(r, num_r, r, num_r, r, num_r, mont)) {
-        goto err;
+      if (!r_is_one) {
+        bn_mod_mul_montgomery_small(r, r, r, num, mont);
       }
       if (wstart == 0) {
         break;
@@ -810,19 +797,17 @@
     // Shift |r| to the end of the window.
     if (!r_is_one) {
       for (unsigned i = 0; i < wsize + 1; i++) {
-        if (!bn_mod_mul_montgomery_small(r, num_r, r, num_r, r, num_r, mont)) {
-          goto err;
-        }
+        bn_mod_mul_montgomery_small(r, r, r, num, mont);
       }
     }
 
     assert(wvalue & 1);
     assert(wvalue < (1u << window));
-    if (!bn_mod_mul_montgomery_small(r, num_r, r, num_r, val[wvalue >> 1],
-                                     num_n, mont)) {
-      goto err;
+    if (r_is_one) {
+      OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG));
+    } else {
+      bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont);
     }
-
     r_is_one = 0;
     if (wstart == wsize) {
       break;
@@ -830,38 +815,33 @@
     wstart -= wsize + 1;
   }
 
-  ret = 1;
-
-err:
+  // |p| is non-zero, so |r_is_one| must be cleared at some point.
+  assert(!r_is_one);
   OPENSSL_cleanse(val, sizeof(val));
-  return ret;
 }
 
-int bn_mod_inverse_prime_mont_small(BN_ULONG *r, size_t num_r,
-                                    const BN_ULONG *a, size_t num_a,
-                                    const BN_MONT_CTX *mont) {
-  const BN_ULONG *p = mont->N.d;
-  size_t num_p = mont->N.width;
-  if (num_p > BN_SMALL_MAX_WORDS || num_p == 0) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_mod_inverse_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                                     const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
 
   // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime.
   BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS];
-  OPENSSL_memcpy(p_minus_two, p, num_p * sizeof(BN_ULONG));
+  const BN_ULONG *p = mont->N.d;
+  OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG));
   if (p_minus_two[0] >= 2) {
     p_minus_two[0] -= 2;
   } else {
     p_minus_two[0] -= 2;
-    for (size_t i = 1; i < num_p; i++) {
+    for (size_t i = 1; i < num; i++) {
       if (p_minus_two[i]-- != 0) {
         break;
       }
     }
   }
 
-  return bn_mod_exp_mont_small(r, num_r, a, num_a, p_minus_two, num_p, mont);
+  bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont);
 }
 
 
@@ -988,12 +968,15 @@
   int powerbufLen = 0;
   unsigned char *powerbuf = NULL;
   BIGNUM tmp, am;
-  BIGNUM *new_a = NULL;
 
   if (!BN_is_odd(m)) {
     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
     return 0;
   }
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
+    return 0;
+  }
 
   // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak
   // whether the top bits are zero.
@@ -1010,7 +993,7 @@
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    new_mont = BN_MONT_CTX_new_consttime(m, ctx);
     if (new_mont == NULL) {
       goto err;
     }
@@ -1021,15 +1004,6 @@
   // implementation assumes it can use |top| to size R.
   int top = mont->N.width;
 
-  if (a->neg || BN_ucmp(a, m) >= 0) {
-    new_a = BN_new();
-    if (new_a == NULL ||
-        !BN_nnmod(new_a, a, m, ctx)) {
-      goto err;
-    }
-    a = new_a;
-  }
-
 #ifdef RSAZ_ENABLED
   // If the size of the operands allow it, perform the optimized
   // RSAZ exponentiation. For further information see
@@ -1290,7 +1264,6 @@
 
 err:
   BN_MONT_CTX_free(new_mont);
-  BN_clear_free(new_a);
   OPENSSL_free(powerbufFree);
   return (ret);
 }
@@ -1303,6 +1276,11 @@
 
   int ret = 0;
 
+  // BN_mod_exp_mont requires reduced inputs.
+  if (bn_minimal_width(m) == 1) {
+    a %= m->d[0];
+  }
+
   if (!BN_set_word(&a_bignum, a)) {
     OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
     goto err;
diff --git a/src/crypto/fipsmodule/bn/internal.h b/src/crypto/fipsmodule/bn/internal.h
index 668d8dd..2fc38df 100644
--- a/src/crypto/fipsmodule/bn/internal.h
+++ b/src/crypto/fipsmodule/bn/internal.h
@@ -519,77 +519,59 @@
 #endif
 
 // bn_mul_small sets |r| to |a|*|b|. |num_r| must be |num_a| + |num_b|. |r| may
-// not alias with |a| or |b|. This function returns one on success and zero if
-// lengths are inconsistent.
-int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
+// not alias with |a| or |b|.
+void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
                  const BN_ULONG *b, size_t num_b);
 
 // bn_sqr_small sets |r| to |a|^2. |num_a| must be at most |BN_SMALL_MAX_WORDS|.
-// |num_r| must be |num_a|*2. |r| and |a| may not alias. This function returns
-// one on success and zero on programmer error.
-int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
+// |num_r| must be |num_a|*2. |r| and |a| may not alias.
+void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
 
 // In the following functions, the modulus must be at most |BN_SMALL_MAX_WORDS|
 // words long.
 
 // bn_to_montgomery_small sets |r| to |a| translated to the Montgomery domain.
-// |num_a| and |num_r| must be the length of the modulus, which is
-// |mont->N.top|. |a| must be fully reduced. This function returns one on
-// success and zero if lengths are inconsistent. |r| and |a| may alias.
-int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                           size_t num_a, const BN_MONT_CTX *mont);
+// |r| and |a| are |num| words long, which must be |mont->N.width|. |a| must be
+// fully reduced and may alias |r|.
+void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                            const BN_MONT_CTX *mont);
 
 // bn_from_montgomery_small sets |r| to |a| translated out of the Montgomery
-// domain. |num_r| must be the length of the modulus, which is |mont->N.top|.
-// |a| must be at most |mont->N.top| * R and |num_a| must be at most 2 *
-// |mont->N.top|. This function returns one on success and zero if lengths are
-// inconsistent. |r| and |a| may alias.
-int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                             size_t num_a, const BN_MONT_CTX *mont);
-
-// bn_one_to_montgomery_small sets |r| to one in Montgomery form. It returns one
-// on success and zero on error. |num_r| must be the length of the modulus,
-// which is |mont->N.top|. This function treats the bit width of the modulus as
-// public.
-int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
-                               const BN_MONT_CTX *mont);
+// domain. |r| and |a| are |num| words long, which must be |mont->N.width|. |a|
+// must be fully-reduced and may alias |r|.
+void bn_from_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                              const BN_MONT_CTX *mont);
 
 // bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
-// and outputs are in the Montgomery domain. |num_r| must be the length of the
-// modulus, which is |mont->N.top|. This function returns one on success and
-// zero on internal error or inconsistent lengths. Any two of |r|, |a|, and |b|
-// may alias.
-//
-// This function requires |a| * |b| < N * R, where N is the modulus and R is the
-// Montgomery divisor, 2^(N.top * BN_BITS2). This should generally be satisfied
-// by ensuring |a| and |b| are fully reduced, however ECDSA has one computation
-// which requires the more general bound.
-int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                                size_t num_a, const BN_ULONG *b, size_t num_b,
-                                const BN_MONT_CTX *mont);
+// and outputs are in the Montgomery domain. Each array is |num| words long,
+// which must be |mont->N.width|. Any two of |r|, |a|, and |b| may alias. |a|
+// and |b| must be reduced on input.
+void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
+                                 const BN_ULONG *b, size_t num,
+                                 const BN_MONT_CTX *mont);
 
 // bn_mod_exp_mont_small sets |r| to |a|^|p| mod |mont->N|. It returns one on
 // success and zero on programmer or internal error. Both inputs and outputs are
-// in the Montgomery domain. |num_r| and |num_a| must be |mont->N.top|, which
-// must be at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced. This
-// function runs in time independent of |a|, but |p| and |mont->N| are public
-// values.
+// in the Montgomery domain. |r| and |a| are |num| words long, which must be
+// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced
+// and may alias |r|. This function runs in time independent of |a|, but |p|
+// and |mont->N| are public values.
 //
 // Note this function differs from |BN_mod_exp_mont| which uses Montgomery
 // reduction but takes input and output outside the Montgomery domain. Combine
 // this function with |bn_from_montgomery_small| and |bn_to_montgomery_small|
 // if necessary.
-int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                          size_t num_a, const BN_ULONG *p, size_t num_p,
-                          const BN_MONT_CTX *mont);
+void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                           const BN_ULONG *p, size_t num_p,
+                           const BN_MONT_CTX *mont);
 
 // bn_mod_inverse_prime_mont_small sets |r| to |a|^-1 mod |mont->N|. |mont->N|
-// must be a prime. |num_r| and |num_a| must be |mont->N.top|, which must be at
-// most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced. This function runs in
-// time independent of |a|, but |mont->N| is a public value.
-int bn_mod_inverse_prime_mont_small(BN_ULONG *r, size_t num_r,
-                                    const BN_ULONG *a, size_t num_a,
-                                    const BN_MONT_CTX *mont);
+// must be a prime. |r| and |a| are |num| words long, which must be
+// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced
+// and may alias |r|. This function runs in time independent of |a|, but
+// |mont->N| is a public value.
+void bn_mod_inverse_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                                     const BN_MONT_CTX *mont);
 
 
 #if defined(__cplusplus)
diff --git a/src/crypto/fipsmodule/bn/montgomery.c b/src/crypto/fipsmodule/bn/montgomery.c
index 7ce8c4c..851c0a0 100644
--- a/src/crypto/fipsmodule/bn/montgomery.c
+++ b/src/crypto/fipsmodule/bn/montgomery.c
@@ -109,6 +109,8 @@
 #include <openssl/bn.h>
 
 #include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include <openssl/err.h>
@@ -170,7 +172,7 @@
 OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS ==
                        sizeof(uint64_t), BN_MONT_CTX_set_64_bit_mismatch);
 
-int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) {
   if (BN_is_zero(mod)) {
     OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
     return 0;
@@ -207,6 +209,13 @@
 #else
   mont->n0[1] = 0;
 #endif
+  return 1;
+}
+
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+  if (!bn_mont_ctx_set_N_and_n0(mont, mod)) {
+    return 0;
+  }
 
   BN_CTX *new_ctx = NULL;
   if (ctx == NULL) {
@@ -223,7 +232,10 @@
   // BN_BITS2|, is correct because R**2 will still be a multiple of the latter
   // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
   unsigned lgBigR = mont->N.width * BN_BITS2;
-  int ok = bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx);
+  BN_zero(&mont->RR);
+  int ok = BN_set_bit(&mont->RR, lgBigR * 2) &&
+           BN_mod(&mont->RR, &mont->RR, &mont->N, ctx) &&
+           bn_resize_words(&mont->RR, mont->N.width);
   BN_CTX_free(new_ctx);
   return ok;
 }
@@ -238,6 +250,24 @@
   return mont;
 }
 
+BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) {
+  BN_MONT_CTX *mont = BN_MONT_CTX_new();
+  if (mont == NULL ||
+      !bn_mont_ctx_set_N_and_n0(mont, mod)) {
+    goto err;
+  }
+  unsigned lgBigR = mont->N.width * BN_BITS2;
+  if (!bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx) ||
+      !bn_resize_words(&mont->RR, mont->N.width)) {
+    goto err;
+  }
+  return mont;
+
+err:
+  BN_MONT_CTX_free(mont);
+  return NULL;
+}
+
 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
                            const BIGNUM *mod, BN_CTX *bn_ctx) {
   CRYPTO_MUTEX_lock_read(lock);
@@ -427,89 +457,53 @@
          bn_fits_in_words(bn, mont->N.width);
 }
 
-int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                           size_t num_a, const BN_MONT_CTX *mont) {
-  return bn_mod_mul_montgomery_small(r, num_r, a, num_a, mont->RR.d,
-                                     mont->RR.width, mont);
+void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                            const BN_MONT_CTX *mont) {
+  bn_mod_mul_montgomery_small(r, a, mont->RR.d, num, mont);
 }
 
-int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                             size_t num_a, const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.width;
-  if (num_a > 2 * num_n || num_r != num_n || num_n > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_from_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                              const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
   BN_ULONG tmp[BN_SMALL_MAX_WORDS * 2];
-  size_t num_tmp = 2 * num_n;
-  OPENSSL_memcpy(tmp, a, num_a * sizeof(BN_ULONG));
-  OPENSSL_memset(tmp + num_a, 0, (num_tmp - num_a) * sizeof(BN_ULONG));
-  int ret = bn_from_montgomery_in_place(r, num_r, tmp, num_tmp, mont);
-  OPENSSL_cleanse(tmp, num_tmp * sizeof(BN_ULONG));
-  return ret;
+  OPENSSL_memcpy(tmp, a, num * sizeof(BN_ULONG));
+  OPENSSL_memset(tmp + num, 0, num * sizeof(BN_ULONG));
+  if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) {
+    abort();
+  }
+  OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
 }
 
-int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
-                               const BN_MONT_CTX *mont) {
-  const BN_ULONG *n = mont->N.d;
-  size_t num_n = mont->N.width;
-  if (num_n == 0 || num_r != num_n) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
-  }
-
-  // If the high bit of |n| is set, R = 2^(num_n*BN_BITS2) < 2 * |n|, so we
-  // compute R - |n| rather than perform Montgomery reduction.
-  if (num_n > 0 && (n[num_n - 1] >> (BN_BITS2 - 1)) != 0) {
-    r[0] = 0 - n[0];
-    for (size_t i = 1; i < num_n; i++) {
-      r[i] = ~n[i];
-    }
-    return 1;
-  }
-
-  return bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.width, mont);
-}
-
-int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                                size_t num_a, const BN_ULONG *b, size_t num_b,
-                                const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.width;
-  if (num_r != num_n || num_a + num_b > 2 * num_n ||
-      num_n > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
+                                 const BN_ULONG *b, size_t num,
+                                 const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
 
 #if defined(OPENSSL_BN_ASM_MONT)
   // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
-  if (num_n >= (128 / BN_BITS2) &&
-      num_a == num_n &&
-      num_b == num_n) {
-    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num_n)) {
-      assert(0);  // The check above ensures this won't happen.
-      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-      return 0;
+  if (num >= (128 / BN_BITS2)) {
+    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
+      abort();  // The check above ensures this won't happen.
     }
-    return 1;
+    return;
   }
 #endif
 
   // Compute the product.
   BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS];
-  size_t num_tmp = 2 * num_n;
-  size_t num_ab = num_a + num_b;
-  if (a == b && num_a == num_b) {
-    if (!bn_sqr_small(tmp, num_ab, a, num_a)) {
-      return 0;
-    }
-  } else if (!bn_mul_small(tmp, num_ab, a, num_a, b, num_b)) {
-    return 0;
+  if (a == b) {
+    bn_sqr_small(tmp, 2 * num, a, num);
+  } else {
+    bn_mul_small(tmp, 2 * num, a, num, b, num);
   }
 
-  // Zero-extend to full width and reduce.
-  OPENSSL_memset(tmp + num_ab, 0, (num_tmp - num_ab) * sizeof(BN_ULONG));
-  int ret = bn_from_montgomery_in_place(r, num_r, tmp, num_tmp, mont);
-  OPENSSL_cleanse(tmp, num_tmp * sizeof(BN_ULONG));
-  return ret;
+  // Reduce.
+  if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) {
+    abort();
+  }
+  OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
 }
diff --git a/src/crypto/fipsmodule/bn/montgomery_inv.c b/src/crypto/fipsmodule/bn/montgomery_inv.c
index a920ca4..94d99e8 100644
--- a/src/crypto/fipsmodule/bn/montgomery_inv.c
+++ b/src/crypto/fipsmodule/bn/montgomery_inv.c
@@ -32,7 +32,8 @@
 #define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
 
 uint64_t bn_mont_n0(const BIGNUM *n) {
-  // These conditions are checked by the caller, |BN_MONT_CTX_set|.
+  // These conditions are checked by the caller, |BN_MONT_CTX_set| or
+  // |BN_MONT_CTX_new_consttime|.
   assert(!BN_is_zero(n));
   assert(!BN_is_negative(n));
   assert(BN_is_odd(n));
diff --git a/src/crypto/fipsmodule/bn/mul.c b/src/crypto/fipsmodule/bn/mul.c
index 4a0711d..bd9393e 100644
--- a/src/crypto/fipsmodule/bn/mul.c
+++ b/src/crypto/fipsmodule/bn/mul.c
@@ -57,6 +57,7 @@
 #include <openssl/bn.h>
 
 #include <assert.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include <openssl/err.h>
@@ -656,11 +657,10 @@
   return bn_mul_impl(r, a, b, ctx);
 }
 
-int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
-                 const BN_ULONG *b, size_t num_b) {
+void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
+                  const BN_ULONG *b, size_t num_b) {
   if (num_r != num_a + num_b) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+    abort();
   }
   // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not
   // hit that code.
@@ -669,7 +669,6 @@
   } else {
     bn_mul_normal(r, a, num_a, b, num_b);
   }
-  return 1;
 }
 
 // tmp must have 2*n words
@@ -858,10 +857,9 @@
   return 1;
 }
 
-int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
+void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
   if (num_r != 2 * num_a || num_a > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+    abort();
   }
   if (num_a == 4) {
     bn_sqr_comba4(r, a);
@@ -872,5 +870,4 @@
     bn_sqr_normal(r, a, num_a, tmp);
     OPENSSL_cleanse(tmp, 2 * num_a * sizeof(BN_ULONG));
   }
-  return 1;
 }
diff --git a/src/crypto/fipsmodule/bn/prime.c b/src/crypto/fipsmodule/bn/prime.c
index a18d377..80b33c2 100644
--- a/src/crypto/fipsmodule/bn/prime.c
+++ b/src/crypto/fipsmodule/bn/prime.c
@@ -690,7 +690,7 @@
   BIGNUM *z = BN_CTX_get(ctx);
   BIGNUM *one_mont = BN_CTX_get(ctx);
   BIGNUM *w1_mont = BN_CTX_get(ctx);
-  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
+  mont = BN_MONT_CTX_new_consttime(w, ctx);
   if (b == NULL || z == NULL || one_mont == NULL || w1_mont == NULL ||
       mont == NULL ||
       !bn_one_to_montgomery(one_mont, mont, ctx) ||
diff --git a/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
index 2a21140..c2f67f4 100755
--- a/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
+++ b/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
@@ -1,15 +1,17 @@
 #! /usr/bin/env perl
 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
 #
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
 # (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
 #
 # Reference:
 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -18,23 +20,25 @@
 # Further optimization by <appro@openssl.org>:
 #
 #		this/original	with/without -DECP_NISTZ256_ASM(*)
-# Opteron	+12-49%		+110-150%
-# Bulldozer	+14-45%		+175-210%
-# P4		+18-46%		n/a :-(
-# Westmere	+12-34%		+80-87%
-# Sandy Bridge	+9-35%		+110-120%
-# Ivy Bridge	+9-35%		+110-125%
-# Haswell	+8-37%		+140-160%
-# Broadwell	+18-58%		+145-210%
-# Atom		+15-50%		+130-180%
-# VIA Nano	+43-160%	+300-480%
+# Opteron	+15-49%		+150-195%
+# Bulldozer	+18-45%		+175-240%
+# P4		+24-46%		+100-150%
+# Westmere	+18-34%		+87-160%
+# Sandy Bridge	+14-35%		+120-185%
+# Ivy Bridge	+11-35%		+125-180%
+# Haswell	+10-37%		+160-200%
+# Broadwell	+24-58%		+210-270%
+# Atom		+20-50%		+180-240%
+# VIA Nano	+50-160%	+480-480%
 #
 # (*)	"without -DECP_NISTZ256_ASM" refers to build with
 #	"enable-ec_nistp_64_gcc_128";
 #
 # Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In the "this/original" column the lower coefficient is for
+# ECDSA sign; in the "with/without" column the lower one is for ECDH key
+# agreement and the higher one for ECDSA sign, the relatively fastest
+# server-side operation. Keep in mind that +100% means 2x improvement.
 
 $flavour = shift;
 $output  = shift;
@@ -71,6 +75,12 @@
 .long 3,3,3,3,3,3,3,3
 .LONE_mont:
 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
 ___
 
 {
@@ -145,6 +155,1087 @@
 
 $code.=<<___;
 ################################################################################
+# void ecp_nistz256_ord_mul_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   uint64_t b[4]);
+
+.globl	ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,\@function,3
+.align	32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+___
+$code.=<<___	if ($addx);
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
+	cmp	\$0x80100, %ecx
+	je	.Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_mul_body:
+
+	mov	8*0($b_org), %rax
+	mov	$b_org, $b_ptr
+	lea	.Lord(%rip), %r14
+	mov	.LordK(%rip), %r15
+
+	################################# * b[0]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	mov	%rax, $acc0
+	mov	$t0, %rax
+	mov	%rdx, $acc1
+
+	mulq	8*1($a_ptr)
+	add	%rax, $acc1
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $acc2
+
+	mulq	8*2($a_ptr)
+	add	%rax, $acc2
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc0, $acc5
+	 imulq	%r15,$acc0
+
+	mov	%rdx, $acc3
+	mulq	8*3($a_ptr)
+	add	%rax, $acc3
+	 mov	$acc0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $acc4
+
+	################################# First reduction step
+	mulq	8*0(%r14)
+	mov	$acc0, $t1
+	add	%rax, $acc5		# guaranteed to be zero
+	mov	$acc0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t0
+
+	sub	$acc0, $acc2
+	sbb	\$0, $acc0		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc1
+	adc	\$0, %rdx
+	add	%rax, $acc1
+	mov	$t1, %rax
+	adc	%rdx, $acc2
+	mov	$t1, %rdx
+	adc	\$0, $acc0		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc3
+	 mov	8*1($b_ptr), %rax
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc0, $acc3
+	adc	$t1, $acc4
+	adc	\$0, $acc5
+
+	################################# * b[1]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	add	%rax, $acc1
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*1($a_ptr)
+	add	$t1, $acc2
+	adc	\$0, %rdx
+	add	%rax, $acc2
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*2($a_ptr)
+	add	$t1, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc1, $t0
+	 imulq	%r15, $acc1
+
+	mov	%rdx, $t1
+	mulq	8*3($a_ptr)
+	add	$t1, $acc4
+	adc	\$0, %rdx
+	xor	$acc0, $acc0
+	add	%rax, $acc4
+	 mov	$acc1, %rax
+	adc	%rdx, $acc5
+	adc	\$0, $acc0
+
+	################################# Second reduction step
+	mulq	8*0(%r14)
+	mov	$acc1, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	$acc1, %rax
+	adc	%rdx, $t0
+
+	sub	$acc1, $acc3
+	sbb	\$0, $acc1		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc2
+	adc	\$0, %rdx
+	add	%rax, $acc2
+	mov	$t1, %rax
+	adc	%rdx, $acc3
+	mov	$t1, %rdx
+	adc	\$0, $acc1		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc4
+	 mov	8*2($b_ptr), %rax
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc1, $acc4
+	adc	$t1, $acc5
+	adc	\$0, $acc0
+
+	################################## * b[2]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	add	%rax, $acc2
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*1($a_ptr)
+	add	$t1, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*2($a_ptr)
+	add	$t1, $acc4
+	adc	\$0, %rdx
+	add	%rax, $acc4
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc2, $t0
+	 imulq	%r15, $acc2
+
+	mov	%rdx, $t1
+	mulq	8*3($a_ptr)
+	add	$t1, $acc5
+	adc	\$0, %rdx
+	xor	$acc1, $acc1
+	add	%rax, $acc5
+	 mov	$acc2, %rax
+	adc	%rdx, $acc0
+	adc	\$0, $acc1
+
+	################################# Third reduction step
+	mulq	8*0(%r14)
+	mov	$acc2, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	$acc2, %rax
+	adc	%rdx, $t0
+
+	sub	$acc2, $acc4
+	sbb	\$0, $acc2		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$t1, %rax
+	adc	%rdx, $acc4
+	mov	$t1, %rdx
+	adc	\$0, $acc2		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc5
+	 mov	8*3($b_ptr), %rax
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc2, $acc5
+	adc	$t1, $acc0
+	adc	\$0, $acc1
+
+	################################# * b[3]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	add	%rax, $acc3
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*1($a_ptr)
+	add	$t1, $acc4
+	adc	\$0, %rdx
+	add	%rax, $acc4
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*2($a_ptr)
+	add	$t1, $acc5
+	adc	\$0, %rdx
+	add	%rax, $acc5
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc3, $t0
+	 imulq	%r15, $acc3
+
+	mov	%rdx, $t1
+	mulq	8*3($a_ptr)
+	add	$t1, $acc0
+	adc	\$0, %rdx
+	xor	$acc2, $acc2
+	add	%rax, $acc0
+	 mov	$acc3, %rax
+	adc	%rdx, $acc1
+	adc	\$0, $acc2
+
+	################################# Last reduction step
+	mulq	8*0(%r14)
+	mov	$acc3, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	$acc3, %rax
+	adc	%rdx, $t0
+
+	sub	$acc3, $acc5
+	sbb	\$0, $acc3		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc4
+	adc	\$0, %rdx
+	add	%rax, $acc4
+	mov	$t1, %rax
+	adc	%rdx, $acc5
+	mov	$t1, %rdx
+	adc	\$0, $acc3		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc0
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc3, $acc0
+	adc	$t1, $acc1
+	adc	\$0, $acc2
+
+	################################# Subtract ord
+	 mov	$acc4, $a_ptr
+	sub	8*0(%r14), $acc4
+	 mov	$acc5, $acc3
+	sbb	8*1(%r14), $acc5
+	 mov	$acc0, $t0
+	sbb	8*2(%r14), $acc0
+	 mov	$acc1, $t1
+	sbb	8*3(%r14), $acc1
+	sbb	\$0, $acc2
+
+	cmovc	$a_ptr, $acc4
+	cmovc	$acc3, $acc5
+	cmovc	$t0, $acc0
+	cmovc	$t1, $acc1
+
+	mov	$acc4, 8*0($r_ptr)
+	mov	$acc5, 8*1($r_ptr)
+	mov	$acc0, 8*2($r_ptr)
+	mov	$acc1, 8*3($r_ptr)
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mul_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   int rep);
+
+.globl	ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,\@function,3
+.align	32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+___
+$code.=<<___	if ($addx);
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
+	cmp	\$0x80100, %ecx
+	je	.Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_sqr_body:
+
+	mov	8*0($a_ptr), $acc0
+	mov	8*1($a_ptr), %rax
+	mov	8*2($a_ptr), $acc6
+	mov	8*3($a_ptr), $acc7
+	lea	.Lord(%rip), $a_ptr	# pointer to modulus
+	mov	$b_org, $b_ptr
+	jmp	.Loop_ord_sqr
+
+.align	32
+.Loop_ord_sqr:
+	################################# a[1:] * a[0]
+	mov	%rax, $t1		# put aside a[1]
+	mul	$acc0			# a[1] * a[0]
+	mov	%rax, $acc1
+	movq	$t1, %xmm1		# offload a[1]
+	mov	$acc6, %rax
+	mov	%rdx, $acc2
+
+	mul	$acc0			# a[2] * a[0]
+	add	%rax, $acc2
+	mov	$acc7, %rax
+	movq	$acc6, %xmm2		# offload a[2]
+	adc	\$0, %rdx
+	mov	%rdx, $acc3
+
+	mul	$acc0			# a[3] * a[0]
+	add	%rax, $acc3
+	mov	$acc7, %rax
+	movq	$acc7, %xmm3		# offload a[3]
+	adc	\$0, %rdx
+	mov	%rdx, $acc4
+
+	################################# a[3] * a[2]
+	mul	$acc6			# a[3] * a[2]
+	mov	%rax, $acc5
+	mov	$acc6, %rax
+	mov	%rdx, $acc6
+
+	################################# a[2:] * a[1]
+	mul	$t1			# a[2] * a[1]
+	add	%rax, $acc3
+	mov	$acc7, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $acc7
+
+	mul	$t1			# a[3] * a[1]
+	add	%rax, $acc4
+	adc	\$0, %rdx
+
+	add	$acc7, $acc4
+	adc	%rdx, $acc5
+	adc	\$0, $acc6		# can't overflow
+
+	################################# *2
+	xor	$acc7, $acc7
+	mov	$acc0, %rax
+	add	$acc1, $acc1
+	adc	$acc2, $acc2
+	adc	$acc3, $acc3
+	adc	$acc4, $acc4
+	adc	$acc5, $acc5
+	adc	$acc6, $acc6
+	adc	\$0, $acc7
+
+	################################# Missing products
+	mul	%rax			# a[0] * a[0]
+	mov	%rax, $acc0
+	movq	%xmm1, %rax
+	mov	%rdx, $t1
+
+	mul	%rax			# a[1] * a[1]
+	add	$t1, $acc1
+	adc	%rax, $acc2
+	movq	%xmm2, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mul	%rax			# a[2] * a[2]
+	add	$t1, $acc3
+	adc	%rax, $acc4
+	movq	%xmm3, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	 mov	$acc0, $t0
+	 imulq	8*4($a_ptr), $acc0	# *= .LordK
+
+	mul	%rax			# a[3] * a[3]
+	add	$t1, $acc5
+	adc	%rax, $acc6
+	 mov	8*0($a_ptr), %rax	# modulus[0]
+	adc	%rdx, $acc7		# can't overflow
+
+	################################# First reduction step
+	mul	$acc0
+	mov	$acc0, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax	# modulus[1]
+	adc	%rdx, $t0
+
+	sub	$acc0, $acc2
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc0
+	add	$t0, $acc1
+	adc	\$0, %rdx
+	add	%rax, $acc1
+	mov	$acc0, %rax
+	adc	%rdx, $acc2
+	mov	$acc0, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	 mov	$acc1, $t0
+	 imulq	8*4($a_ptr), $acc1	# *= .LordK
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc3
+	 mov	8*0($a_ptr), %rax
+	sbb	%rdx, $acc0		# can't borrow
+
+	add	$t1, $acc3
+	adc	\$0, $acc0		# can't overflow
+
+	################################# Second reduction step
+	mul	$acc1
+	mov	$acc1, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax
+	adc	%rdx, $t0
+
+	sub	$acc1, $acc3
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc1
+	add	$t0, $acc2
+	adc	\$0, %rdx
+	add	%rax, $acc2
+	mov	$acc1, %rax
+	adc	%rdx, $acc3
+	mov	$acc1, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	 mov	$acc2, $t0
+	 imulq	8*4($a_ptr), $acc2	# *= .LordK
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc0
+	 mov	8*0($a_ptr), %rax
+	sbb	%rdx, $acc1		# can't borrow
+
+	add	$t1, $acc0
+	adc	\$0, $acc1		# can't overflow
+
+	################################# Third reduction step
+	mul	$acc2
+	mov	$acc2, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax
+	adc	%rdx, $t0
+
+	sub	$acc2, $acc0
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc2
+	add	$t0, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$acc2, %rax
+	adc	%rdx, $acc0
+	mov	$acc2, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	 mov	$acc3, $t0
+	 imulq	8*4($a_ptr), $acc3	# *= .LordK
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc1
+	 mov	8*0($a_ptr), %rax
+	sbb	%rdx, $acc2		# can't borrow
+
+	add	$t1, $acc1
+	adc	\$0, $acc2		# can't overflow
+
+	################################# Last reduction step
+	mul	$acc3
+	mov	$acc3, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax
+	adc	%rdx, $t0
+
+	sub	$acc3, $acc1
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc3
+	add	$t0, $acc0
+	adc	\$0, %rdx
+	add	%rax, $acc0
+	mov	$acc3, %rax
+	adc	%rdx, $acc1
+	mov	$acc3, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc2
+	sbb	%rdx, $acc3		# can't borrow
+
+	add	$t1, $acc2
+	adc	\$0, $acc3		# can't overflow
+
+	################################# Add bits [511:256] of the sqr result
+	xor	%rdx, %rdx
+	add	$acc4, $acc0
+	adc	$acc5, $acc1
+	 mov	$acc0, $acc4
+	adc	$acc6, $acc2
+	adc	$acc7, $acc3
+	 mov	$acc1, %rax
+	adc	\$0, %rdx
+
+	################################# Compare to modulus
+	sub	8*0($a_ptr), $acc0
+	 mov	$acc2, $acc6
+	sbb	8*1($a_ptr), $acc1
+	sbb	8*2($a_ptr), $acc2
+	 mov	$acc3, $acc7
+	sbb	8*3($a_ptr), $acc3
+	sbb	\$0, %rdx
+
+	cmovc	$acc4, $acc0
+	cmovnc	$acc1, %rax
+	cmovnc	$acc2, $acc6
+	cmovnc	$acc3, $acc7
+
+	dec	$b_ptr
+	jnz	.Loop_ord_sqr
+
+	mov	$acc0, 8*0($r_ptr)
+	mov	%rax,  8*1($r_ptr)
+	pxor	%xmm1, %xmm1
+	mov	$acc6, 8*2($r_ptr)
+	pxor	%xmm2, %xmm2
+	mov	$acc7, 8*3($r_ptr)
+	pxor	%xmm3, %xmm3
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqr_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+
+$code.=<<___	if ($addx);
+################################################################################
+.type	ecp_nistz256_ord_mul_montx,\@function,3
+.align	32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_mulx_body:
+
+	mov	$b_org, $b_ptr
+	mov	8*0($b_org), %rdx
+	mov	8*0($a_ptr), $acc1
+	mov	8*1($a_ptr), $acc2
+	mov	8*2($a_ptr), $acc3
+	mov	8*3($a_ptr), $acc4
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	.Lord-128(%rip), %r14
+	mov	.LordK(%rip), %r15
+
+	################################# Multiply by b[0]
+	mulx	$acc1, $acc0, $acc1
+	mulx	$acc2, $t0, $acc2
+	mulx	$acc3, $t1, $acc3
+	add	$t0, $acc1
+	mulx	$acc4, $t0, $acc4
+	 mov	$acc0, %rdx
+	 mulx	%r15, %rdx, %rax
+	adc	$t1, $acc2
+	adc	$t0, $acc3
+	adc	\$0, $acc4
+
+	################################# reduction
+	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc0		# guaranteed to be zero
+	adox	$t1, $acc1
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*3+128(%r14), $t0, $t1
+	 mov	8*1($b_ptr), %rdx
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+	adcx	$acc0, $acc4
+	adox	$acc0, $acc5
+	adc	\$0, $acc5		# cf=0, of=0
+
+	################################# Multiply by b[1]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc1, %rdx
+	 mulx	%r15, %rdx, %rax
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	adcx	$acc0, $acc5
+	adox	$acc0, $acc0
+	adc	\$0, $acc0		# cf=0, of=0
+
+	################################# reduction
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc1		# guaranteed to be zero
+	adox	$t1, $acc2
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*3+128(%r14), $t0, $t1
+	 mov	8*2($b_ptr), %rdx
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	adcx	$acc1, $acc5
+	adox	$acc1, $acc0
+	adc	\$0, $acc0		# cf=0, of=0
+
+	################################# Multiply by b[2]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc2, %rdx
+	 mulx	%r15, %rdx, %rax
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	adcx	$acc1, $acc0
+	adox	$acc1, $acc1
+	adc	\$0, $acc1		# cf=0, of=0
+
+	################################# reduction
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc2		# guaranteed to be zero
+	adox	$t1, $acc3
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*3+128(%r14), $t0, $t1
+	 mov	8*3($b_ptr), %rdx
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+	adcx	$acc2, $acc0
+	adox	$acc2, $acc1
+	adc	\$0, $acc1		# cf=0, of=0
+
+	################################# Multiply by b[3]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc3, %rdx
+	 mulx	%r15, %rdx, %rax
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+
+	adcx	$acc2, $acc1
+	adox	$acc2, $acc2
+	adc	\$0, $acc2		# cf=0, of=0
+
+	################################# reduction
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc3		# guaranteed to be zero
+	adox	$t1, $acc4
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	mulx	8*3+128(%r14), $t0, $t1
+	lea	128(%r14),%r14
+	 mov	$acc4, $t2
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	 mov	$acc5, $t3
+	adcx	$acc3, $acc1
+	adox	$acc3, $acc2
+	adc	\$0, $acc2
+
+	#################################
+	# Branch-less conditional subtraction of the order (.Lord)
+	 mov	$acc0, $t0
+	sub	8*0(%r14), $acc4
+	sbb	8*1(%r14), $acc5
+	sbb	8*2(%r14), $acc0
+	 mov	$acc1, $t1
+	sbb	8*3(%r14), $acc1
+	sbb	\$0, $acc2
+
+	cmovc	$t2, $acc4
+	cmovc	$t3, $acc5
+	cmovc	$t0, $acc0
+	cmovc	$t1, $acc1
+
+	mov	$acc4, 8*0($r_ptr)
+	mov	$acc5, 8*1($r_ptr)
+	mov	$acc0, 8*2($r_ptr)
+	mov	$acc1, 8*3($r_ptr)
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mulx_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type	ecp_nistz256_ord_sqr_montx,\@function,3
+.align	32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_sqrx_body:
+
+	mov	$b_org, $b_ptr
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), $acc6
+	mov	8*2($a_ptr), $acc7
+	mov	8*3($a_ptr), $acc0
+	lea	.Lord(%rip), $a_ptr
+	jmp	.Loop_ord_sqrx
+
+.align	32
+.Loop_ord_sqrx:
+	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
+	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
+	 mov	%rdx, %rax		# offload a[0]
+	 movq	$acc6, %xmm1		# offload a[1]
+	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
+	 mov	$acc6, %rdx
+	add	$t0, $acc2
+	 movq	$acc7, %xmm2		# offload a[2]
+	adc	$t1, $acc3
+	adc	\$0, $acc4
+	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
+	#################################
+	mulx	$acc7, $t0, $t1		# a[1]*a[2]
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	$acc0, $t0, $t1		# a[1]*a[3]
+	 mov	$acc7, %rdx
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	adc	\$0, $acc5
+	#################################
+	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
+	mov	%rax, %rdx
+	 movq	$acc0, %xmm3		# offload a[3]
+	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
+	 adcx	$acc1, $acc1		# acc1:6<<1
+	adox	$t0, $acc5
+	 adcx	$acc2, $acc2
+	adox	$acc7, $acc6		# of=0
+
+	################################# a[i]*a[i]
+	mulx	%rdx, $acc0, $t1
+	movq	%xmm1, %rdx
+	 adcx	$acc3, $acc3
+	adox	$t1, $acc1
+	 adcx	$acc4, $acc4
+	mulx	%rdx, $t0, $t4
+	movq	%xmm2, %rdx
+	 adcx	$acc5, $acc5
+	adox	$t0, $acc2
+	 adcx	$acc6, $acc6
+	mulx	%rdx, $t0, $t1
+	.byte	0x67
+	movq	%xmm3, %rdx
+	adox	$t4, $acc3
+	 adcx	$acc7, $acc7
+	adox	$t0, $acc4
+	adox	$t1, $acc5
+	mulx	%rdx, $t0, $t4
+	adox	$t0, $acc6
+	adox	$t4, $acc7
+
+	################################# reduction
+	mov	$acc0, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	xor	%rax, %rax		# cf=0, of=0
+	mulx	8*0($a_ptr), $t0, $t1
+	adcx	$t0, $acc0		# guaranteed to be zero
+	adox	$t1, $acc1
+	mulx	8*1($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*2($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*3($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0		# of=0
+	adcx	%rax, $acc0		# cf=0
+
+	#################################
+	mov	$acc1, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	mulx	8*0($a_ptr), $t0, $t1
+	adox	$t0, $acc1		# guaranteed to be zero
+	adcx	$t1, $acc2
+	mulx	8*1($a_ptr), $t0, $t1
+	adox	$t0, $acc2
+	adcx	$t1, $acc3
+	mulx	8*2($a_ptr), $t0, $t1
+	adox	$t0, $acc3
+	adcx	$t1, $acc0
+	mulx	8*3($a_ptr), $t0, $t1
+	adox	$t0, $acc0
+	adcx	$t1, $acc1		# cf=0
+	adox	%rax, $acc1		# of=0
+
+	#################################
+	mov	$acc2, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	mulx	8*0($a_ptr), $t0, $t1
+	adcx	$t0, $acc2		# guaranteed to be zero
+	adox	$t1, $acc3
+	mulx	8*1($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0
+	mulx	8*2($a_ptr), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	mulx	8*3($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2		# of=0
+	adcx	%rax, $acc2		# cf=0
+
+	#################################
+	mov	$acc3, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	mulx	8*0($a_ptr), $t0, $t1
+	adox	$t0, $acc3		# guaranteed to be zero
+	adcx	$t1, $acc0
+	mulx	8*1($a_ptr), $t0, $t1
+	adox	$t0, $acc0
+	adcx	$t1, $acc1
+	mulx	8*2($a_ptr), $t0, $t1
+	adox	$t0, $acc1
+	adcx	$t1, $acc2
+	mulx	8*3($a_ptr), $t0, $t1
+	adox	$t0, $acc2
+	adcx	$t1, $acc3
+	adox	%rax, $acc3
+
+	################################# accumulate upper half
+	add	$acc0, $acc4		# add	$acc4, $acc0
+	adc	$acc5, $acc1
+	 mov	$acc4, %rdx
+	adc	$acc6, $acc2
+	adc	$acc7, $acc3
+	 mov	$acc1, $acc6
+	adc	\$0, %rax
+
+	################################# compare to modulus
+	sub	8*0($a_ptr), $acc4
+	 mov	$acc2, $acc7
+	sbb	8*1($a_ptr), $acc1
+	sbb	8*2($a_ptr), $acc2
+	 mov	$acc3, $acc0
+	sbb	8*3($a_ptr), $acc3
+	sbb	\$0, %rax
+
+	cmovnc	$acc4, %rdx
+	cmovnc	$acc1, $acc6
+	cmovnc	$acc2, $acc7
+	cmovnc	$acc3, $acc0
+
+	dec	$b_ptr
+	jnz	.Loop_ord_sqrx
+
+	mov	%rdx, 8*0($r_ptr)
+	mov	$acc6, 8*1($r_ptr)
+	pxor	%xmm1, %xmm1
+	mov	$acc7, 8*2($r_ptr)
+	pxor	%xmm2, %xmm2
+	mov	$acc0, 8*3($r_ptr)
+	pxor	%xmm3, %xmm3
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqrx_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+
+$code.=<<___;
+################################################################################
 # void ecp_nistz256_mul_mont(
 #   uint64_t res[4],
 #   uint64_t a[4],
@@ -2840,6 +3931,24 @@
 	.rva	.LSEH_end_ecp_nistz256_neg
 	.rva	.LSEH_info_ecp_nistz256_neg
 
+	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
+	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
+	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
+	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
+	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___	if ($addx);
+	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
+	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
+	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx
+
+	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
+	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
+	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
 	.rva	.LSEH_begin_ecp_nistz256_mul_mont
 	.rva	.LSEH_end_ecp_nistz256_mul_mont
 	.rva	.LSEH_info_ecp_nistz256_mul_mont
@@ -2899,6 +4008,30 @@
 	.byte	9,0,0,0
 	.rva	short_handler
 	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_ord_mul_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_ord_sqr_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
+	.long	48,0
+___
+$code.=<<___ if ($addx);
+.LSEH_info_ecp_nistz256_ord_mul_montx:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_ord_sqr_montx:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
+	.long	48,0
+___
+$code.=<<___;
 .LSEH_info_ecp_nistz256_mul_mont:
 	.byte	9,0,0,0
 	.rva	full_handler
diff --git a/src/crypto/fipsmodule/ec/ec.c b/src/crypto/fipsmodule/ec/ec.c
index ee7ec55..07f9c34 100644
--- a/src/crypto/fipsmodule/ec/ec.c
+++ b/src/crypto/fipsmodule/ec/ec.c
@@ -352,7 +352,7 @@
   }
 
   if (BN_num_bytes(order) > EC_MAX_SCALAR_BYTES) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FIELD);
+    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER);
     return 0;
   }
 
@@ -839,7 +839,7 @@
   BIGNUM *tmp = BN_CTX_get(ctx);
   int ok = tmp != NULL &&
            BN_nnmod(tmp, in, order, ctx) &&
-           ec_bignum_to_scalar_unchecked(group, out, tmp);
+           ec_bignum_to_scalar(group, out, tmp);
   BN_CTX_end(ctx);
   return ok;
 }
@@ -955,30 +955,3 @@
 
   return OPENSSL_NUM_BUILT_IN_CURVES;
 }
-
-int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
-                        const BIGNUM *in) {
-  if (!ec_bignum_to_scalar_unchecked(group, out, in)) {
-    return 0;
-  }
-  if (!bn_less_than_words(out->words, group->order.d, group->order.width)) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
-    return 0;
-  }
-  return 1;
-}
-
-int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
-                                  const BIGNUM *in) {
-  if (!bn_copy_words(out->words, group->order.width, in)) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
-    return 0;
-  }
-  return 1;
-}
-
-int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
-                             const uint8_t additional_data[32]) {
-  return bn_rand_range_words(out->words, 1, group->order.d, group->order.width,
-                             additional_data);
-}
diff --git a/src/crypto/fipsmodule/ec/ec_montgomery.c b/src/crypto/fipsmodule/ec/ec_montgomery.c
index 165c06f..d80fa23 100644
--- a/src/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/src/crypto/fipsmodule/ec/ec_montgomery.c
@@ -184,68 +184,58 @@
 
   BN_CTX_start(ctx);
 
-  if (BN_cmp(&point->Z, &group->one) == 0) {
-    // |point| is already affine.
-    if (x != NULL && !BN_from_montgomery(x, &point->X, group->mont, ctx)) {
+  // transform  (X, Y, Z)  into  (x, y) := (X/Z^2, Y/Z^3)
+
+  BIGNUM *Z_1 = BN_CTX_get(ctx);
+  BIGNUM *Z_2 = BN_CTX_get(ctx);
+  BIGNUM *Z_3 = BN_CTX_get(ctx);
+  if (Z_1 == NULL ||
+      Z_2 == NULL ||
+      Z_3 == NULL) {
+    goto err;
+  }
+
+  // The straightforward way to calculate the inverse of a Montgomery-encoded
+  // value where the result is Montgomery-encoded is:
+  //
+  //    |BN_from_montgomery| + invert + |BN_to_montgomery|.
+  //
+  // This is equivalent, but more efficient, because |BN_from_montgomery|
+  // is more efficient (at least in theory) than |BN_to_montgomery|, since it
+  // doesn't have to do the multiplication before the reduction.
+  //
+  // Use Fermat's Little Theorem instead of |BN_mod_inverse_odd| since this
+  // inversion may be done as the final step of private key operations.
+  // Unfortunately, this is suboptimal for ECDSA verification.
+  if (!BN_from_montgomery(Z_1, &point->Z, group->mont, ctx) ||
+      !BN_from_montgomery(Z_1, Z_1, group->mont, ctx) ||
+      !bn_mod_inverse_prime(Z_1, Z_1, &group->field, ctx, group->mont)) {
+    goto err;
+  }
+
+  if (!BN_mod_mul_montgomery(Z_2, Z_1, Z_1, group->mont, ctx)) {
+    goto err;
+  }
+
+  // Instead of using |BN_from_montgomery| to convert the |x| coordinate
+  // and then calling |BN_from_montgomery| again to convert the |y|
+  // coordinate below, convert the common factor |Z_2| once now, saving one
+  // reduction.
+  if (!BN_from_montgomery(Z_2, Z_2, group->mont, ctx)) {
+    goto err;
+  }
+
+  if (x != NULL) {
+    if (!BN_mod_mul_montgomery(x, &point->X, Z_2, group->mont, ctx)) {
       goto err;
     }
-    if (y != NULL && !BN_from_montgomery(y, &point->Y, group->mont, ctx)) {
+  }
+
+  if (y != NULL) {
+    if (!BN_mod_mul_montgomery(Z_3, Z_2, Z_1, group->mont, ctx) ||
+        !BN_mod_mul_montgomery(y, &point->Y, Z_3, group->mont, ctx)) {
       goto err;
     }
-  } else {
-    // transform  (X, Y, Z)  into  (x, y) := (X/Z^2, Y/Z^3)
-
-    BIGNUM *Z_1 = BN_CTX_get(ctx);
-    BIGNUM *Z_2 = BN_CTX_get(ctx);
-    BIGNUM *Z_3 = BN_CTX_get(ctx);
-    if (Z_1 == NULL ||
-        Z_2 == NULL ||
-        Z_3 == NULL) {
-      goto err;
-    }
-
-    // The straightforward way to calculate the inverse of a Montgomery-encoded
-    // value where the result is Montgomery-encoded is:
-    //
-    //    |BN_from_montgomery| + invert + |BN_to_montgomery|.
-    //
-    // This is equivalent, but more efficient, because |BN_from_montgomery|
-    // is more efficient (at least in theory) than |BN_to_montgomery|, since it
-    // doesn't have to do the multiplication before the reduction.
-    //
-    // Use Fermat's Little Theorem instead of |BN_mod_inverse_odd| since this
-    // inversion may be done as the final step of private key operations.
-    // Unfortunately, this is suboptimal for ECDSA verification.
-    if (!BN_from_montgomery(Z_1, &point->Z, group->mont, ctx) ||
-        !BN_from_montgomery(Z_1, Z_1, group->mont, ctx) ||
-        !bn_mod_inverse_prime(Z_1, Z_1, &group->field, ctx, group->mont)) {
-      goto err;
-    }
-
-    if (!BN_mod_mul_montgomery(Z_2, Z_1, Z_1, group->mont, ctx)) {
-      goto err;
-    }
-
-    // Instead of using |BN_from_montgomery| to convert the |x| coordinate
-    // and then calling |BN_from_montgomery| again to convert the |y|
-    // coordinate below, convert the common factor |Z_2| once now, saving one
-    // reduction.
-    if (!BN_from_montgomery(Z_2, Z_2, group->mont, ctx)) {
-      goto err;
-    }
-
-    if (x != NULL) {
-      if (!BN_mod_mul_montgomery(x, &point->X, Z_2, group->mont, ctx)) {
-        goto err;
-      }
-    }
-
-    if (y != NULL) {
-      if (!BN_mod_mul_montgomery(Z_3, Z_2, Z_1, group->mont, ctx) ||
-          !BN_mod_mul_montgomery(y, &point->Y, Z_3, group->mont, ctx)) {
-        goto err;
-      }
-    }
   }
 
   ret = 1;
@@ -267,4 +257,5 @@
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
   out->field_decode = ec_GFp_mont_field_decode;
+  out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
 }
diff --git a/src/crypto/fipsmodule/ec/internal.h b/src/crypto/fipsmodule/ec/internal.h
index c5d7291..7f72c31 100644
--- a/src/crypto/fipsmodule/ec/internal.h
+++ b/src/crypto/fipsmodule/ec/internal.h
@@ -133,6 +133,12 @@
                       BN_CTX *);  // e.g. to Montgomery
   int (*field_decode)(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                       BN_CTX *);  // e.g. from Montgomery
+
+  // scalar_inv_montgomery sets |out| to |in|^-1, where both input and output
+  // are in Montgomery form.
+  void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
+                                const EC_SCALAR *in);
+
 } /* EC_METHOD */;
 
 const EC_METHOD *EC_GFp_mont_method(void);
@@ -183,16 +189,33 @@
 OPENSSL_EXPORT int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
                                        const BIGNUM *in);
 
-// ec_bignum_to_scalar_unchecked behaves like |ec_bignum_to_scalar| but does not
-// check |in| is fully reduced.
-int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
-                                  const BIGNUM *in);
-
 // ec_random_nonzero_scalar sets |out| to a uniformly selected random value from
 // 1 to |group->order| - 1. It returns one on success and zero on error.
 int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
                              const uint8_t additional_data[32]);
 
+// ec_scalar_add sets |r| to |a| + |b|.
+void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
+                   const EC_SCALAR *b);
+
+// ec_scalar_to_montgomery sets |r| to |a| in Montgomery form.
+void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                             const EC_SCALAR *a);
+
+// ec_scalar_from_montgomery sets |r| to |a| converted from Montgomery form.
+void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                               const EC_SCALAR *a);
+
+// ec_scalar_mul_montgomery sets |r| to |a| * |b| where inputs and outputs are
+// in Montgomery form.
+void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a, const EC_SCALAR *b);
+
+// ec_scalar_inv_montgomery sets |r| to |a|^-1 where inputs and outputs are in
+// Montgomery form.
+void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a);
+
 // ec_point_add_mixed behaves like |EC_POINT_add|, but |&b->Z| must be zero or
 // one.
 int ec_point_add_mixed(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
@@ -254,6 +277,8 @@
 int ec_GFp_simple_make_affine(const EC_GROUP *, EC_POINT *, BN_CTX *);
 int ec_GFp_simple_points_make_affine(const EC_GROUP *, size_t num,
                                      EC_POINT * [], BN_CTX *);
+void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a);
 
 // method functions in montgomery.c
 int ec_GFp_mont_group_init(EC_GROUP *);
diff --git a/src/crypto/fipsmodule/ec/p224-64.c b/src/crypto/fipsmodule/ec/p224-64.c
index 7e2f45b..0a379fe 100644
--- a/src/crypto/fipsmodule/ec/p224-64.c
+++ b/src/crypto/fipsmodule/ec/p224-64.c
@@ -203,38 +203,25 @@
   }
 }
 
-// To preserve endianness when using BN_bn2bin and BN_bin2bn
-static void p224_flip_endian(uint8_t *out, const uint8_t *in, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    out[i] = in[len - 1 - i];
-  }
-}
-
 // From OpenSSL BIGNUM to internal representation
 static int p224_BN_to_felem(p224_felem out, const BIGNUM *bn) {
   // BN_bn2bin eats leading zeroes
   p224_felem_bytearray b_out;
-  OPENSSL_memset(b_out, 0, sizeof(b_out));
-  size_t num_bytes = BN_num_bytes(bn);
-  if (num_bytes > sizeof(b_out) ||
-      BN_is_negative(bn)) {
+  if (BN_is_negative(bn) ||
+      !BN_bn2le_padded(b_out, sizeof(b_out), bn)) {
     OPENSSL_PUT_ERROR(EC, EC_R_BIGNUM_OUT_OF_RANGE);
     return 0;
   }
 
-  p224_felem_bytearray b_in;
-  num_bytes = BN_bn2bin(bn, b_in);
-  p224_flip_endian(b_out, b_in, num_bytes);
   p224_bin28_to_felem(out, b_out);
   return 1;
 }
 
 // From internal representation to OpenSSL BIGNUM
 static BIGNUM *p224_felem_to_BN(BIGNUM *out, const p224_felem in) {
-  p224_felem_bytearray b_in, b_out;
-  p224_felem_to_bin28(b_in, in);
-  p224_flip_endian(b_out, b_in, sizeof(b_out));
-  return BN_bin2bn(b_out, sizeof(b_out), out);
+  p224_felem_bytearray b_out;
+  p224_felem_to_bin28(b_out, in);
+  return BN_le2bn(b_out, sizeof(b_out), out);
 }
 
 // Field operations, using the internal representation of field elements.
@@ -1127,6 +1114,7 @@
   out->field_sqr = ec_GFp_nistp224_field_sqr;
   out->field_encode = NULL;
   out->field_decode = NULL;
+  out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
 };
 
 #endif  // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index dbe99ed..d8d3a39 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -430,6 +430,87 @@
   return 1;
 }
 
+static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
+                                     const EC_SCALAR *in) {
+  // table[i] stores a power of |in| corresponding to the matching enum value.
+  enum {
+    // The following indices specify the power in binary.
+    i_1 = 0,
+    i_10,
+    i_11,
+    i_101,
+    i_111,
+    i_1010,
+    i_1111,
+    i_10101,
+    i_101010,
+    i_101111,
+    // The following indices specify 2^N-1, or N ones in a row.
+    i_x6,
+    i_x8,
+    i_x16,
+    i_x32
+  };
+  BN_ULONG table[15][P256_LIMBS];
+
+  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+  //
+  // Even though this code path spares 12 squarings, 4.5%, and 13
+  // multiplications, 25%, the overall sign operation is not that much faster,
+  // not more than 2%. Most of the performance of this function comes from the
+  // scalar operations.
+
+  // Pre-calculate powers.
+  OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG));
+
+  ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
+
+  ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
+
+  ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1);
+  ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
+
+  ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2);
+  ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8);
+  ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16);
+  ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
+
+  // Compute |in| raised to the order-2.
+  ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64);
+  ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]);
+  static const struct {
+    uint8_t p, i;
+  } kChain[27] = {{32, i_x32},    {6, i_101111}, {5, i_111},    {4, i_11},
+                  {5, i_1111},    {5, i_10101},  {4, i_101},    {3, i_101},
+                  {3, i_101},     {5, i_111},    {9, i_101111}, {6, i_1111},
+                  {2, i_1},       {5, i_1},      {6, i_1111},   {5, i_111},
+                  {4, i_111},     {5, i_111},    {5, i_101},    {3, i_11},
+                  {10, i_101111}, {2, i_11},     {5, i_11},     {5, i_11},
+                  {3, i_1},       {7, i_10101},  {6, i_1111}};
+  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kChain); i++) {
+    ecp_nistz256_ord_sqr_mont(out->words, out->words, kChain[i].p);
+    ecp_nistz256_ord_mul_mont(out->words, out->words, table[kChain[i].i]);
+  }
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
@@ -441,6 +522,7 @@
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
   out->field_decode = ec_GFp_mont_field_decode;
+  out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
 };
 
 #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.h b/src/crypto/fipsmodule/ec/p256-x86_64.h
index 9226124..21b461c 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.h
@@ -62,6 +62,24 @@
 }
 
 
+// P-256 scalar operations.
+//
+// The following functions compute modulo N, where N is the order of P-256. They
+// take fully-reduced inputs and give fully-reduced outputs.
+
+// ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs
+// are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N.
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+                               const BN_ULONG a[P256_LIMBS],
+                               const BN_ULONG b[P256_LIMBS]);
+
+// ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2^|rep|) where inputs and
+// outputs are in Montgomery form. That is, |res| is
+// (|a| * 2^-256)^(2^|rep|) * 2^256 mod N.
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+                               const BN_ULONG a[P256_LIMBS], int rep);
+
+
 // P-256 point operations.
 //
 // The following functions may be used in-place. All coordinates are in the
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
index 5cd701b..8ed1dd4 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -365,6 +365,47 @@
   }
 }
 
+static void TestOrdMulMont(FileTest *t) {
+  // This test works on scalars rather than field elements, but the
+  // representation is the same.
+  BN_ULONG a[P256_LIMBS], b[P256_LIMBS], result[P256_LIMBS];
+  ASSERT_TRUE(GetFieldElement(t, a, "A"));
+  ASSERT_TRUE(GetFieldElement(t, b, "B"));
+  ASSERT_TRUE(GetFieldElement(t, result, "Result"));
+
+  BN_ULONG ret[P256_LIMBS];
+  ecp_nistz256_ord_mul_mont(ret, a, b);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  ecp_nistz256_ord_mul_mont(ret, b, a);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, a, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, ret /* a */, b);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, a, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, b, ret);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, b, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, a, ret /* b */);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, b, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, ret /* b */, a);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  if (OPENSSL_memcmp(a, b, sizeof(a)) == 0) {
+    ecp_nistz256_ord_sqr_mont(ret, a, 1);
+    EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+    OPENSSL_memcpy(ret, a, sizeof(ret));
+    ecp_nistz256_ord_sqr_mont(ret, ret /* a */, 1);
+    EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+  }
+}
+
 TEST(P256_X86_64Test, TestVectors) {
   return FileTestGTest("crypto/fipsmodule/ec/p256-x86_64_tests.txt",
                        [](FileTest *t) {
@@ -376,6 +417,8 @@
       TestFromMont(t);
     } else if (t->GetParameter() == "PointAdd") {
       TestPointAdd(t);
+    } else if (t->GetParameter() == "OrdMulMont") {
+      TestOrdMulMont(t);
     } else {
       FAIL() << "Unknown test type:" << t->GetParameter();
     }
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt b/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt
index a680850..d1fdad0 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt
@@ -1403,3 +1403,141 @@
 B.Z = 1f7c7226d78e51478c683bbb6afe01abc2225dbfc773d0806d30ff5f827b76c8
 Result.X = fba400ae656ec3103c5c5f531d2a0f7368031e01a48a91f1a4f3138d294b13be
 Result.Y = 160e358ad1f059eb62722df01a7440048a1db21ecaea8698efa9677db6e9ff97
+
+
+# Scalar montgomery multiplication tests.
+#
+# The following tests satisfy A * B * 2^-256 = Result (mod N).
+
+Test = OrdMulMont
+A = 0000000000000000000000000000000000000000000000000000000000000000
+B = b4e9b0aea84aa5ed86964a22881a4d0e58f88e9225f30990c18751e7d4b9ec95
+Result = 0000000000000000000000000000000000000000000000000000000000000000
+
+Test = OrdMulMont
+A = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+B = 5d24e62244973fbd829573d5a579b4e89a6512933a2c3d255bbdbc1c89028323
+Result = 5d24e62244973fbd829573d5a579b4e89a6512933a2c3d255bbdbc1c89028323
+
+Test = OrdMulMont
+A = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632550
+B = abafdc695e4c2c850f8fc60f1efdbf7406a3cd2c6c59bb7e608985723896c187
+Result = 917b1214c7b31a7ee7e53be0b41a139e435ff576b51ec6af1e1a944412bea38b
+
+Test = OrdMulMont
+A = cf0f01b83670a1c79154ea16f3574ca2d4c688a3c3b6017795cbe54854418904
+B = c5ec4d3b00fb2e11fb3b1aa09e60f7d187f7c515977d1343dab9745961fcbb43
+Result = 7aaddcee32e3b340af5ad06f854284cbbce5a1ab919e9b7771c3b0e937093438
+
+Test = OrdMulMont
+A = 50023f9913879ac4020bc45a89a0ea89082db6265b96b851af29969dd8a9661c
+B = 7c165b1cba80808db114441563aa0fbfba41b9e8acff77312a2dd2138b74ef89
+Result = 3d2ca1705d8d38cbc76a5409c6535044733cafcb95d12654af1d14de177978b5
+
+Test = OrdMulMont
+A = 4d5341ea735e53d2e4f2934755642adee209bd0e5a1506206513227f3c48b270
+B = 6e48f2b60eb8fb86760134abaf3d61692557862924069c599ceb31309ea18704
+Result = 37cde3e35c814d4287bd345b910d687983929907b7a08afa2acd8596832ea86c
+
+Test = OrdMulMont
+A = 33d06c3f5a595a41a6f9c4356f8ab2b8c550d4c64b806eab5560af247c5fa9ed
+B = 0e52f34adf5754343bcf3529d652620da3c05b5dd9cdcddfb08b674a1ad21a09
+Result = 9dc64d7b4c1bc33b930e0daee2a24fc41f770378659ee71b846d2239b0fea8ea
+
+Test = OrdMulMont
+A = 8f211780cce4f93b7193b9378e6f83e1147fb3602b052eef782de8cc833e54ab
+B = e1e4f7f1feb15be64292cff86b47cd9730bcb15b133340022b824d591a660cdf
+Result = dfa2b683b1ae23027c7c109e0abb40a1366eda027ad2cad1a09061a57bee391f
+
+Test = OrdMulMont
+A = 803c279c7e4c11a5568290c0a5789ceab6860f51a942bf646501a45e1ec0a6bf
+B = c0a1145a12037129c571f5f939bf16ea0b8b480f08ec774c045d059841f7d5ed
+Result = ab48fa3b4aa692a7c077cc55ee3c3fff895118a23728c2fa5f361b30730d955a
+
+Test = OrdMulMont
+A = 0e5c95158297d75dbf0b02c3090730f65bf14704495b14837dd907af569407f1
+B = 5a03e3787c8772b2fb7ab07d7fe7fe653a58bdae7fde3174c6ed305e524f5728
+Result = 71296d305dcf9ce39010ea4f4bbf9f7c1064a413597bdc7574c13dea3fa514dc
+
+Test = OrdMulMont
+A = 366299be07886f7846fc74231db624b169360e3c8f60196a1afc9f2101e03922
+B = d6d7c830a6edb6861868b964519a6b68f6f24f7c09d66003f3f88eadd1e00158
+Result = 0b89596bf5054ebe95a39dab6e975b58190160610b09b2a4f93331ecc0e79fd3
+
+Test = OrdMulMont
+A = 8f36f0ef275a72192c3b7388e84df2b8acf66fc53aaf556e3be05c76b3f782c0
+B = 704e519363d44e8df8d91f5f347eb61e8d3e85c8fc1b82980c370a379b2bc81c
+Result = b70a392e3ce5e85b5efbbded9b8c16a3068ba9b93b4cbed9a9a71dffaad6b58a
+
+Test = OrdMulMont
+A = bf4466ef4dea9f06f0f3b4f14e01140a774262c7e0706584f4d7dac19be46d58
+B = 4af12d528b2cef0f6714961bca2ab682f8abaa97600ea8181f71563d56f8a9f5
+Result = 7b6827c0881b9846e32499e13277efb07917cf4b8c8c72bfb3daa8c1786a8e15
+
+
+# Test cases where A == B to test squaring.
+
+Test = OrdMulMont
+A = 0000000000000000000000000000000000000000000000000000000000000000
+B = 0000000000000000000000000000000000000000000000000000000000000000
+Result = 0000000000000000000000000000000000000000000000000000000000000000
+
+Test = OrdMulMont
+A = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+B = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+Result = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+
+Test = OrdMulMont
+A = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632550
+B = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632550
+Result = 60d066334905c1e907f8b6041e607725badef3e243566fafce1bc8f79c197c79
+
+Test = OrdMulMont
+A = da43b8dd7fe8830a4fe8980ec585ccbe903a2965a695cdff398200b74b2ede41
+B = da43b8dd7fe8830a4fe8980ec585ccbe903a2965a695cdff398200b74b2ede41
+Result = 5ec68604412205b380e26ee4e4081eccc10ac7d1417b09cd534f8517b0de81ec
+
+Test = OrdMulMont
+A = a82a2b8bdbf8a37dc7cb5799691494a8c9fbf649686a4d250dc30697feb0fa47
+B = a82a2b8bdbf8a37dc7cb5799691494a8c9fbf649686a4d250dc30697feb0fa47
+Result = 552c094a8841621d6cc26b3b54ce5da5664283888445196a6433d3cfdcad3aee
+
+Test = OrdMulMont
+A = d785006e250410d9dcc6d7740795a7374c25b00b9c9a37b8285694a07307eacd
+B = d785006e250410d9dcc6d7740795a7374c25b00b9c9a37b8285694a07307eacd
+Result = 971aaa9e70ad082cf43725f2e65bc73f4bf762459cee13167545072ec7bdcaf8
+
+Test = OrdMulMont
+A = 69d6d9f5417e87d603a3fb6acafa0d1f974abf94ca57ce58d718a0ad5d02a496
+B = 69d6d9f5417e87d603a3fb6acafa0d1f974abf94ca57ce58d718a0ad5d02a496
+Result = eb3284e5799fbe93171f08e6de9f792cd17f036b3a17671b0310e49b48e589b3
+
+Test = OrdMulMont
+A = 1c28f742c3e26e74901d0425f2eb4d5272524668d2405875b32cf6433f212900
+B = 1c28f742c3e26e74901d0425f2eb4d5272524668d2405875b32cf6433f212900
+Result = 74f70a95399b7ad061a2200fa50528d68eee4654341c8158101e1e3f8f16e642
+
+Test = OrdMulMont
+A = 026b2f69f0259d221920b2f358b378a79826f0332ee36afa257765043e3d6732
+B = 026b2f69f0259d221920b2f358b378a79826f0332ee36afa257765043e3d6732
+Result = e1e9cfa4724995bb50971ca22f3c028cd31cb51fbef8a37c31f10fd1d468f13b
+
+Test = OrdMulMont
+A = 376ed4fadcc1c6c4160a0c9c2ab7c62260367968b08d304d47c65f25625d7d60
+B = 376ed4fadcc1c6c4160a0c9c2ab7c62260367968b08d304d47c65f25625d7d60
+Result = b9ccb67f377e1278f1d2eeda26e5eed76f32406c9deed9764fc0aa346d91e02b
+
+Test = OrdMulMont
+A = 50f66867d0a4ef389678d760d2a4db886583b4c068d0e240f7ddf3472c871304
+B = 50f66867d0a4ef389678d760d2a4db886583b4c068d0e240f7ddf3472c871304
+Result = 82c3467bc5f7ca8b45f4ee61546745e2f53755a02e87f65f572418d60e471c8b
+
+Test = OrdMulMont
+A = 5b8bd82b37206d2b727f19ad2d02f63773470074dde7d43d2a77c448ddf2f978
+B = 5b8bd82b37206d2b727f19ad2d02f63773470074dde7d43d2a77c448ddf2f978
+Result = dbf3c2fc67a0688c3b5ff12cab1739d50b6093c5d98943d388652b1207e4a0f2
+
+Test = OrdMulMont
+A = bed7b3a4dada0e16984eb59ee239005ab212e5b1772cdd5d240c8ee268f65c81
+B = bed7b3a4dada0e16984eb59ee239005ab212e5b1772cdd5d240c8ee268f65c81
+Result = 9232aa2759ca9c5efbaefb0cf45cc6bc9c89def8c25e5c169fe623f30787df36
diff --git a/src/crypto/fipsmodule/ec/scalar.c b/src/crypto/fipsmodule/ec/scalar.c
new file mode 100644
index 0000000..aa364de
--- /dev/null
+++ b/src/crypto/fipsmodule/ec/scalar.c
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/ec.h>
+
+#include "internal.h"
+#include "../bn/internal.h"
+
+
+int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
+                        const BIGNUM *in) {
+  if (!bn_copy_words(out->words, group->order.width, in) ||
+      !bn_less_than_words(out->words, group->order.d, group->order.width)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
+    return 0;
+  }
+  return 1;
+}
+
+int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
+                             const uint8_t additional_data[32]) {
+  return bn_rand_range_words(out->words, 1, group->order.d, group->order.width,
+                             additional_data);
+}
+
+void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
+                   const EC_SCALAR *b) {
+  const BIGNUM *order = &group->order;
+  BN_ULONG tmp[EC_MAX_SCALAR_WORDS];
+  bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width);
+  OPENSSL_cleanse(tmp, sizeof(tmp));
+}
+
+void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                             const EC_SCALAR *a) {
+  const BIGNUM *order = &group->order;
+  bn_to_montgomery_small(r->words, a->words, order->width, group->order_mont);
+}
+
+void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                               const EC_SCALAR *a) {
+  const BIGNUM *order = &group->order;
+  bn_from_montgomery_small(r->words, a->words, order->width, group->order_mont);
+}
+
+void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a, const EC_SCALAR *b) {
+  const BIGNUM *order = &group->order;
+  bn_mod_mul_montgomery_small(r->words, a->words, b->words, order->width,
+                              group->order_mont);
+}
+
+void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a) {
+  const BIGNUM *order = &group->order;
+  bn_mod_inverse_prime_mont_small(r->words, a->words, order->width,
+                                  group->order_mont);
+}
+
+void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a) {
+  group->meth->scalar_inv_montgomery(group, r, a);
+}
diff --git a/src/crypto/fipsmodule/ecdsa/ecdsa.c b/src/crypto/fipsmodule/ecdsa/ecdsa.c
index 85490fa..f3ce214 100644
--- a/src/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/src/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -66,22 +66,6 @@
 #include "../../internal.h"
 
 
-static void scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
-                       const EC_SCALAR *b) {
-  const BIGNUM *order = &group->order;
-  BN_ULONG tmp[EC_MAX_SCALAR_WORDS];
-  bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width);
-  OPENSSL_cleanse(tmp, sizeof(tmp));
-}
-
-static int scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
-                                 const EC_SCALAR *a, const EC_SCALAR *b) {
-  const BIGNUM *order = &group->order;
-  return bn_mod_mul_montgomery_small(r->words, order->width, a->words,
-                                     order->width, b->words, order->width,
-                                     group->order_mont);
-}
-
 // digest_to_scalar interprets |digest_len| bytes from |digest| as a scalar for
 // ECDSA. Note this value is not fully reduced modulo the order, only the
 // correct number of bits.
@@ -217,7 +201,6 @@
   }
 
   EC_SCALAR r, s, u1, u2, s_inv_mont, m;
-  const BIGNUM *order = EC_GROUP_get0_order(group);
   if (BN_is_zero(sig->r) ||
       !ec_bignum_to_scalar(group, &r, sig->r) ||
       BN_is_zero(sig->s) ||
@@ -225,27 +208,22 @@
     OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
     goto err;
   }
-  // s_inv_mont = s^-1 mod order. We convert the result to Montgomery form for
-  // the products below.
-  int no_inverse;
-  if (!BN_mod_inverse_odd(X, &no_inverse, sig->s, order, ctx) ||
-      // TODO(davidben): Add a words version of |BN_mod_inverse_odd| and write
-      // into |s_inv_mont| directly.
-      !ec_bignum_to_scalar_unchecked(group, &s_inv_mont, X) ||
-      !bn_to_montgomery_small(s_inv_mont.words, order->width, s_inv_mont.words,
-                              order->width, group->order_mont)) {
-    goto err;
-  }
+
+  // s_inv_mont = s^-1 in the Montgomery domain. This is
+  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
+  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
+  // equivalent and slightly more efficient.
+  ec_scalar_inv_montgomery(group, &s_inv_mont, &s);
+  ec_scalar_from_montgomery(group, &s_inv_mont, &s_inv_mont);
+
   // u1 = m * s^-1 mod order
   // u2 = r * s^-1 mod order
   //
   // |s_inv_mont| is in Montgomery form while |m| and |r| are not, so |u1| and
   // |u2| will be taken out of Montgomery form, as desired.
   digest_to_scalar(group, &m, digest, digest_len);
-  if (!scalar_mul_montgomery(group, &u1, &m, &s_inv_mont) ||
-      !scalar_mul_montgomery(group, &u2, &r, &s_inv_mont)) {
-    goto err;
-  }
+  ec_scalar_mul_montgomery(group, &u1, &m, &s_inv_mont);
+  ec_scalar_mul_montgomery(group, &u2, &r, &s_inv_mont);
 
   point = EC_POINT_new(group);
   if (point == NULL) {
@@ -328,15 +306,12 @@
       }
     }
 
-    // Compute k^-1. We leave it in the Montgomery domain as an optimization for
-    // later operations.
-    if (!bn_to_montgomery_small(out_kinv_mont->words, order->width, k.words,
-                                order->width, group->order_mont) ||
-        !bn_mod_inverse_prime_mont_small(out_kinv_mont->words, order->width,
-                                         out_kinv_mont->words, order->width,
-                                         group->order_mont)) {
-      goto err;
-    }
+    // Compute k^-1 in the Montgomery domain. This is |ec_scalar_to_montgomery|
+    // followed by |ec_scalar_inv_montgomery|, but |ec_scalar_inv_montgomery|
+    // followed by |ec_scalar_from_montgomery| is equivalent and slightly more
+    // efficient.
+    ec_scalar_inv_montgomery(group, out_kinv_mont, &k);
+    ec_scalar_from_montgomery(group, out_kinv_mont, out_kinv_mont);
 
     // Compute r, the x-coordinate of generator * k.
     if (!ec_point_mul_scalar(group, tmp_point, &k, NULL, NULL, ctx) ||
@@ -396,20 +371,19 @@
     // Compute priv_key * r (mod order). Note if only one parameter is in the
     // Montgomery domain, |scalar_mod_mul_montgomery| will compute the answer in
     // the normal domain.
-    if (!ec_bignum_to_scalar(group, &r_mont, ret->r) ||
-        !bn_to_montgomery_small(r_mont.words, order->width, r_mont.words,
-                                order->width, group->order_mont) ||
-        !scalar_mul_montgomery(group, &s, priv_key, &r_mont)) {
+    if (!ec_bignum_to_scalar(group, &r_mont, ret->r)) {
       goto err;
     }
+    ec_scalar_to_montgomery(group, &r_mont, &r_mont);
+    ec_scalar_mul_montgomery(group, &s, priv_key, &r_mont);
 
     // Compute tmp = m + priv_key * r.
-    scalar_add(group, &tmp, &m, &s);
+    ec_scalar_add(group, &tmp, &m, &s);
 
     // Finally, multiply s by k^-1. That was retained in Montgomery form, so the
     // same technique as the previous multiplication works.
-    if (!scalar_mul_montgomery(group, &s, &tmp, &kinv_mont) ||
-        !bn_set_words(ret->s, s.words, order->width)) {
+    ec_scalar_mul_montgomery(group, &s, &tmp, &kinv_mont);
+    if (!bn_set_words(ret->s, s.words, order->width)) {
       goto err;
     }
     if (!BN_is_zero(ret->s)) {
diff --git a/src/crypto/fipsmodule/rsa/rsa_impl.c b/src/crypto/fipsmodule/rsa/rsa_impl.c
index 49cbc15..6d1206b 100644
--- a/src/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/src/crypto/fipsmodule/rsa/rsa_impl.c
@@ -167,7 +167,7 @@
 
   if (rsa->p != NULL && rsa->q != NULL) {
     if (rsa->mont_p == NULL) {
-      rsa->mont_p = BN_MONT_CTX_new_for_modulus(rsa->p, ctx);
+      rsa->mont_p = BN_MONT_CTX_new_consttime(rsa->p, ctx);
       if (rsa->mont_p == NULL) {
         goto err;
       }
@@ -175,7 +175,7 @@
     const BIGNUM *p_fixed = &rsa->mont_p->N;
 
     if (rsa->mont_q == NULL) {
-      rsa->mont_q = BN_MONT_CTX_new_for_modulus(rsa->q, ctx);
+      rsa->mont_q = BN_MONT_CTX_new_consttime(rsa->q, ctx);
       if (rsa->mont_q == NULL) {
         goto err;
       }
@@ -715,7 +715,13 @@
   }
 
   if (rsa->p != NULL && rsa->q != NULL && rsa->e != NULL && rsa->dmp1 != NULL &&
-      rsa->dmq1 != NULL && rsa->iqmp != NULL) {
+      rsa->dmq1 != NULL && rsa->iqmp != NULL &&
+      // Require that we can reduce |f| by |rsa->p| and |rsa->q| in constant
+      // time, which requires primes be the same size, rounded to the Montgomery
+      // coefficient. (See |mod_montgomery|.) This is not required by RFC 8017,
+      // but it is true for keys generated by us and all common implementations.
+      bn_less_than_montgomery_R(rsa->q, rsa->mont_p) &&
+      bn_less_than_montgomery_R(rsa->p, rsa->mont_q)) {
     if (!mod_exp(result, f, rsa, ctx)) {
       goto err;
     }
@@ -780,11 +786,11 @@
                           const BN_MONT_CTX *mont_p, const BIGNUM *q,
                           BN_CTX *ctx) {
   // Reducing in constant-time with Montgomery reduction requires I <= p * R. We
-  // have I < p * q, so this follows if q < R. In particular, this always holds
-  // if p and q are the same size, which is true for any RSA keys we or anyone
-  // sane generates. For other keys, we fall back to |BN_mod|.
+  // have I < p * q, so this follows if q < R. The caller should have checked
+  // this already.
   if (!bn_less_than_montgomery_R(q, mont_p)) {
-    return BN_mod(r, I, p, ctx);
+    OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR);
+    return 0;
   }
 
   if (// Reduce mod p with Montgomery reduction. This computes I * R^-1 mod p.
@@ -928,6 +934,8 @@
 // relatively prime to |e|. If |p| is non-NULL, |out| will also not be close to
 // |p|. |sqrt2| must be ⌊2^(bits-1)×√2⌋ (or a slightly overestimate for large
 // sizes), and |pow2_bits_100| must be 2^(bits-100).
+//
+// This function fails with probability around 2^-21.
 static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e,
                           const BIGNUM *p, const BIGNUM *sqrt2,
                           const BIGNUM *pow2_bits_100, BN_CTX *ctx,
@@ -944,11 +952,36 @@
   // Use the limit from steps 4.7 and 5.8 for most values of |e|. When |e| is 3,
   // the 186-4 limit is too low, so we use a higher one. Note this case is not
   // reachable from |RSA_generate_key_fips|.
+  //
+  // |limit| determines the failure probability. We must find a prime that is
+  // not 1 mod |e|. By the prime number theorem, we'll find one with probability
+  // p = (e-1)/e * 2/(ln(2)*bits). Note the second term is doubled because we
+  // discard even numbers.
+  //
+  // The failure probability is thus (1-p)^limit. To convert that to a power of
+  // two, we take logs. -log_2((1-p)^limit) = -limit * ln(1-p) / ln(2).
+  //
+  // >>> def f(bits, e, limit):
+  // ...   p = (e-1.0)/e * 2.0/(math.log(2)*bits)
+  // ...   return -limit * math.log(1 - p) / math.log(2)
+  // ...
+  // >>> f(1024, 65537, 5*1024)
+  // 20.842750558272634
+  // >>> f(1536, 65537, 5*1536)
+  // 20.83294549602474
+  // >>> f(2048, 65537, 5*2048)
+  // 20.828047576234948
+  // >>> f(1024, 3, 8*1024)
+  // 22.222147925962307
+  // >>> f(1536, 3, 8*1536)
+  // 22.21518251065506
+  // >>> f(2048, 3, 8*2048)
+  // 22.211701985875937
   if (bits >= INT_MAX/32) {
     OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE);
     return 0;
   }
-  int limit = BN_is_word(e, 3) ? bits * 32 : bits * 5;
+  int limit = BN_is_word(e, 3) ? bits * 8 : bits * 5;
 
   int ret = 0, tries = 0, rand_tries = 0;
   BN_CTX_start(ctx);
@@ -1027,7 +1060,14 @@
   return ret;
 }
 
-int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) {
+// rsa_generate_key_impl generates an RSA key using a generalized version of
+// FIPS 186-4 appendix B.3. |RSA_generate_key_fips| performs additional checks
+// for FIPS-compliant key generation.
+//
+// This function returns one on success and zero on failure. It has a failure
+// probability of about 2^-20.
+static int rsa_generate_key_impl(RSA *rsa, int bits, BIGNUM *e_value,
+                                 BN_GENCB *cb) {
   // See FIPS 186-4 appendix B.3. This function implements a generalized version
   // of the FIPS algorithm. |RSA_generate_key_fips| performs additional checks
   // for FIPS-compliant key generation.
@@ -1113,6 +1153,9 @@
   do {
     // Generate p and q, each of size |prime_bits|, using the steps outlined in
     // appendix FIPS 186-4 appendix B.3.3.
+    //
+    // Each call to |generate_prime| fails with probability p = 2^-21. The
+    // probability that either call fails is 1 - (1-p)^2, which is around 2^-20.
     if (!generate_prime(rsa->p, prime_bits, rsa->e, NULL, sqrt2,
                         pow2_prime_bits_100, ctx, cb) ||
         !BN_GENCB_call(cb, 3, 0) ||
@@ -1192,6 +1235,65 @@
   return ret;
 }
 
+static void replace_bignum(BIGNUM **out, BIGNUM **in) {
+  BN_free(*out);
+  *out = *in;
+  *in = NULL;
+}
+
+static void replace_bn_mont_ctx(BN_MONT_CTX **out, BN_MONT_CTX **in) {
+  BN_MONT_CTX_free(*out);
+  *out = *in;
+  *in = NULL;
+}
+
+int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) {
+  // |rsa_generate_key_impl|'s 2^-20 failure probability is too high at scale,
+  // so we run the FIPS algorithm four times, bringing it down to 2^-80. We
+  // would rather just raise the retry limit, but FIPS 186-4 prescribes that
+  // value, which forces this extra layer of complexity.
+  for (int i = 0; i < 4; i++) {
+    ERR_clear_error();
+    // Generate into scratch space, to avoid leaving partial work on failure.
+    RSA *tmp = RSA_new();
+    if (tmp == NULL) {
+      return 0;
+    }
+    if (rsa_generate_key_impl(tmp, bits, e_value, cb)) {
+      replace_bignum(&rsa->n, &tmp->n);
+      replace_bignum(&rsa->e, &tmp->e);
+      replace_bignum(&rsa->d, &tmp->d);
+      replace_bignum(&rsa->p, &tmp->p);
+      replace_bignum(&rsa->q, &tmp->q);
+      replace_bignum(&rsa->dmp1, &tmp->dmp1);
+      replace_bignum(&rsa->dmq1, &tmp->dmq1);
+      replace_bignum(&rsa->iqmp, &tmp->iqmp);
+      replace_bn_mont_ctx(&rsa->mont_n, &tmp->mont_n);
+      replace_bn_mont_ctx(&rsa->mont_p, &tmp->mont_p);
+      replace_bn_mont_ctx(&rsa->mont_q, &tmp->mont_q);
+      replace_bignum(&rsa->d_fixed, &tmp->d_fixed);
+      replace_bignum(&rsa->dmp1_fixed, &tmp->dmp1_fixed);
+      replace_bignum(&rsa->dmq1_fixed, &tmp->dmq1_fixed);
+      replace_bignum(&rsa->inv_small_mod_large_mont,
+                     &tmp->inv_small_mod_large_mont);
+      rsa->private_key_frozen = tmp->private_key_frozen;
+      RSA_free(tmp);
+      return 1;
+    }
+    uint32_t err = ERR_peek_error();
+    RSA_free(tmp);
+    tmp = NULL;
+    // Only retry on |RSA_R_TOO_MANY_ITERATIONS|. This is so a caller-induced
+    // failure in |BN_GENCB_call| is still fatal.
+    if (ERR_GET_LIB(err) != ERR_LIB_RSA ||
+        ERR_GET_REASON(err) != RSA_R_TOO_MANY_ITERATIONS) {
+      return 0;
+    }
+  }
+
+  return 0;
+}
+
 int RSA_generate_key_fips(RSA *rsa, int bits, BN_GENCB *cb) {
   // FIPS 186-4 allows 2048-bit and 3072-bit RSA keys (1024-bit and 1536-bit
   // primes, respectively) with the prime generation method we use.
diff --git a/src/crypto/mem.c b/src/crypto/mem.c
index 50c6fe6..5d45baa 100644
--- a/src/crypto/mem.c
+++ b/src/crypto/mem.c
@@ -59,7 +59,6 @@
 #include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
-#include <string.h>
 
 #if defined(OPENSSL_WINDOWS)
 OPENSSL_MSVC_PRAGMA(warning(push, 3))
diff --git a/src/crypto/rsa_extra/rsa_test.cc b/src/crypto/rsa_extra/rsa_test.cc
index a6bfb87..211b690 100644
--- a/src/crypto/rsa_extra/rsa_test.cc
+++ b/src/crypto/rsa_extra/rsa_test.cc
@@ -501,12 +501,12 @@
   ERR_clear_error();
 
   // Test that we can generate 2048-bit and 3072-bit RSA keys.
-  EXPECT_TRUE(RSA_generate_key_fips(rsa.get(), 2048, nullptr));
+  ASSERT_TRUE(RSA_generate_key_fips(rsa.get(), 2048, nullptr));
   EXPECT_EQ(2048u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_fips(rsa.get(), 3072, nullptr));
+  ASSERT_TRUE(RSA_generate_key_fips(rsa.get(), 3072, nullptr));
   EXPECT_EQ(3072u, BN_num_bits(rsa->n));
 }
 
@@ -653,22 +653,22 @@
 
   bssl::UniquePtr<RSA> rsa(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1025, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1025, e.get(), nullptr));
   EXPECT_EQ(1024u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1027, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1027, e.get(), nullptr));
   EXPECT_EQ(1024u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1151, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1151, e.get(), nullptr));
   EXPECT_EQ(1024u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1152, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1152, e.get(), nullptr));
   EXPECT_EQ(1152u, BN_num_bits(rsa->n));
 }
 
@@ -896,6 +896,123 @@
   ASSERT_TRUE(BN_sub(rsa->iqmp, rsa->iqmp, rsa->p));
 }
 
+TEST(RSATest, KeygenFail) {
+  bssl::UniquePtr<RSA> rsa(RSA_new());
+  ASSERT_TRUE(rsa);
+
+  // Cause RSA key generation to fail after a prime has been generated, to test
+  // that |rsa| is left alone.
+  BN_GENCB cb;
+  BN_GENCB_set(&cb,
+               [](int event, int, BN_GENCB *) -> int { return event != 3; },
+               nullptr);
+
+  bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+
+  // Key generation should fail.
+  EXPECT_FALSE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+
+  // Failed key generations do not leave garbage in |rsa|.
+  EXPECT_FALSE(rsa->n);
+  EXPECT_FALSE(rsa->e);
+  EXPECT_FALSE(rsa->d);
+  EXPECT_FALSE(rsa->p);
+  EXPECT_FALSE(rsa->q);
+  EXPECT_FALSE(rsa->dmp1);
+  EXPECT_FALSE(rsa->dmq1);
+  EXPECT_FALSE(rsa->iqmp);
+  EXPECT_FALSE(rsa->mont_n);
+  EXPECT_FALSE(rsa->mont_p);
+  EXPECT_FALSE(rsa->mont_q);
+  EXPECT_FALSE(rsa->d_fixed);
+  EXPECT_FALSE(rsa->dmp1_fixed);
+  EXPECT_FALSE(rsa->dmq1_fixed);
+  EXPECT_FALSE(rsa->inv_small_mod_large_mont);
+  EXPECT_FALSE(rsa->private_key_frozen);
+
+  // Failed key generations leave the previous contents alone.
+  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), nullptr));
+  uint8_t *der;
+  size_t der_len;
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, rsa.get()));
+  bssl::UniquePtr<uint8_t> delete_der(der);
+
+  EXPECT_FALSE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+
+  uint8_t *der2;
+  size_t der2_len;
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der2, &der2_len, rsa.get()));
+  bssl::UniquePtr<uint8_t> delete_der2(der2);
+  EXPECT_EQ(Bytes(der, der_len), Bytes(der2, der2_len));
+
+  // Generating a key over an existing key works, despite any cached state.
+  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), nullptr));
+  EXPECT_TRUE(RSA_check_key(rsa.get()));
+  uint8_t *der3;
+  size_t der3_len;
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der3, &der3_len, rsa.get()));
+  bssl::UniquePtr<uint8_t> delete_der3(der3);
+  EXPECT_NE(Bytes(der, der_len), Bytes(der3, der3_len));
+}
+
+TEST(RSATest, KeygenFailOnce) {
+  bssl::UniquePtr<RSA> rsa(RSA_new());
+  ASSERT_TRUE(rsa);
+
+  // Cause only the first iteration of RSA key generation to fail.
+  bool failed = false;
+  BN_GENCB cb;
+  BN_GENCB_set(&cb,
+               [](int event, int n, BN_GENCB *cb_ptr) -> int {
+                 bool *failed_ptr = static_cast<bool *>(cb_ptr->arg);
+                 if (*failed_ptr) {
+                   ADD_FAILURE() << "Callback called multiple times.";
+                   return 1;
+                 }
+                 *failed_ptr = true;
+                 return 0;
+               },
+               &failed);
+
+  // Although key generation internally retries, the external behavior of
+  // |BN_GENCB| is preserved.
+  bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+  EXPECT_FALSE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+}
+
+TEST(RSATest, KeygenInternalRetry) {
+  bssl::UniquePtr<RSA> rsa(RSA_new());
+  ASSERT_TRUE(rsa);
+
+  // Simulate one internal attempt at key generation failing.
+  bool failed = false;
+  BN_GENCB cb;
+  BN_GENCB_set(&cb,
+               [](int event, int n, BN_GENCB *cb_ptr) -> int {
+                 bool *failed_ptr = static_cast<bool *>(cb_ptr->arg);
+                 if (*failed_ptr) {
+                   return 1;
+                 }
+                 *failed_ptr = true;
+                 // This test does not test any public API behavior. It is just
+                 // a hack to exercise the retry codepath and make sure it
+                 // works.
+                 OPENSSL_PUT_ERROR(RSA, RSA_R_TOO_MANY_ITERATIONS);
+                 return 0;
+               },
+               &failed);
+
+  // Key generation internally retries on RSA_R_TOO_MANY_ITERATIONS.
+  bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+}
+
 #if !defined(BORINGSSL_SHARED_LIBRARY)
 TEST(RSATest, SqrtTwo) {
   bssl::UniquePtr<BIGNUM> sqrt(BN_new()), pow2(BN_new());
diff --git a/src/crypto/x509/by_dir.c b/src/crypto/x509/by_dir.c
index b3bfffe..9a0e2eb 100644
--- a/src/crypto/x509/by_dir.c
+++ b/src/crypto/x509/by_dir.c
@@ -65,6 +65,8 @@
 #include <openssl/thread.h>
 #include <openssl/x509.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include "../internal.h"
 
 typedef struct lookup_dir_hashes_st {
@@ -452,3 +454,5 @@
         BUF_MEM_free(b);
     return (ok);
 }
+
+#endif  // OPENSSL_TRUSTY