external/boringssl: Sync to a63d0ad40dd621d5b9472dc9f1756692f969451e.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/9f0e7cb314ae64234b928fd379381ae9760a9a5f..a63d0ad40dd621d5b9472dc9f1756692f969451e

Test: BoringSSL CTS Presubmits.
Change-Id: I283b7d8f01ceef3becb152708b65894c717e3680
diff --git a/src/crypto/bio/connect.c b/src/crypto/bio/connect.c
index 0b60f6a..604803a 100644
--- a/src/crypto/bio/connect.c
+++ b/src/crypto/bio/connect.c
@@ -56,6 +56,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <assert.h>
 #include <errno.h>
 #include <string.h>
@@ -540,3 +542,5 @@
 int BIO_do_connect(BIO *bio) {
   return BIO_ctrl(bio, BIO_C_DO_STATE_MACHINE, 0, NULL);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/fd.c b/src/crypto/bio/fd.c
index fed5228..877f53d 100644
--- a/src/crypto/bio/fd.c
+++ b/src/crypto/bio/fd.c
@@ -56,6 +56,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <errno.h>
 #include <string.h>
 
@@ -274,3 +276,5 @@
 int BIO_get_fd(BIO *bio, int *out_fd) {
   return BIO_ctrl(bio, BIO_C_GET_FD, 0, (char *) out_fd);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/file.c b/src/crypto/bio/file.c
index f61dbe4..6a0b9a9 100644
--- a/src/crypto/bio/file.c
+++ b/src/crypto/bio/file.c
@@ -73,6 +73,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <errno.h>
 #include <stdio.h>
 #include <string.h>
@@ -313,3 +315,5 @@
   return BIO_ctrl(bio, BIO_C_SET_FILENAME,
                   BIO_CLOSE | BIO_FP_READ | BIO_FP_WRITE, (char *)filename);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/socket.c b/src/crypto/bio/socket.c
index 111761f..081ce01 100644
--- a/src/crypto/bio/socket.c
+++ b/src/crypto/bio/socket.c
@@ -57,6 +57,8 @@
 
 #include <openssl/bio.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <fcntl.h>
 #include <string.h>
 
@@ -200,3 +202,5 @@
   BIO_set_fd(ret, fd, close_flag);
   return ret;
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/bio/socket_helper.c b/src/crypto/bio/socket_helper.c
index 268405a..d4209d0 100644
--- a/src/crypto/bio/socket_helper.c
+++ b/src/crypto/bio/socket_helper.c
@@ -18,6 +18,8 @@
 #include <openssl/bio.h>
 #include <openssl/err.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include <fcntl.h>
 #include <string.h>
 #include <sys/types.h>
@@ -112,3 +114,5 @@
   }
   return error;
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/crypto/cipher_extra/tls_cbc.c b/src/crypto/cipher_extra/tls_cbc.c
index 6f95130..a24602b 100644
--- a/src/crypto/cipher_extra/tls_cbc.c
+++ b/src/crypto/cipher_extra/tls_cbc.c
@@ -271,7 +271,7 @@
   HASH_CTX md_state;
   void (*md_final_raw)(HASH_CTX *ctx, uint8_t *md_out);
   void (*md_transform)(HASH_CTX *ctx, const uint8_t *block);
-  unsigned md_size, md_block_size = 64;
+  unsigned md_size, md_block_size = 64, md_block_shift = 6;
   // md_length_size is the number of bytes in the length field that terminates
   // the hash.
   unsigned md_length_size = 8;
@@ -305,6 +305,7 @@
       md_transform = tls1_sha512_transform;
       md_size = SHA384_DIGEST_LENGTH;
       md_block_size = 128;
+      md_block_shift = 7;
       md_length_size = 16;
       break;
 
@@ -318,6 +319,7 @@
 
   assert(md_length_size <= MAX_HASH_BIT_COUNT_BYTES);
   assert(md_block_size <= MAX_HASH_BLOCK_SIZE);
+  assert(md_block_size == (1u << md_block_shift));
   assert(md_size <= EVP_MAX_MD_SIZE);
 
   static const size_t kHeaderLength = 13;
@@ -350,18 +352,16 @@
   // k is the starting byte offset into the conceptual header||data where
   // we start processing.
   size_t k = 0;
-  // mac_end_offset is the index just past the end of the data to be
-  // MACed.
+  // mac_end_offset is the index just past the end of the data to be MACed.
   size_t mac_end_offset = data_plus_mac_size + kHeaderLength - md_size;
-  // c is the index of the 0x80 byte in the final hash block that
-  // contains application data.
-  size_t c = mac_end_offset % md_block_size;
-  // index_a is the hash block number that contains the 0x80 terminating
-  // value.
-  size_t index_a = mac_end_offset / md_block_size;
-  // index_b is the hash block number that contains the 64-bit hash
-  // length, in bits.
-  size_t index_b = (mac_end_offset + md_length_size) / md_block_size;
+  // c is the index of the 0x80 byte in the final hash block that contains
+  // application data.
+  size_t c = mac_end_offset & (md_block_size - 1);
+  // index_a is the hash block number that contains the 0x80 terminating value.
+  size_t index_a = mac_end_offset >> md_block_shift;
+  // index_b is the hash block number that contains the 64-bit hash length, in
+  // bits.
+  size_t index_b = (mac_end_offset + md_length_size) >> md_block_shift;
 
   if (num_blocks > kVarianceBlocks) {
     num_starting_blocks = num_blocks - kVarianceBlocks;
diff --git a/src/crypto/cpu-intel.c b/src/crypto/cpu-intel.c
index 1ac280c..701ebed 100644
--- a/src/crypto/cpu-intel.c
+++ b/src/crypto/cpu-intel.c
@@ -170,10 +170,11 @@
     }
   }
 
-  uint32_t extended_features = 0;
+  uint32_t extended_features[2] = {0};
   if (num_ids >= 7) {
     OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7);
-    extended_features = ebx;
+    extended_features[0] = ebx;
+    extended_features[1] = ecx;
   }
 
   // Determine the number of cores sharing an L1 data cache to adjust the
@@ -241,26 +242,26 @@
     //
     // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream
     // doesn't clear those.
-    extended_features &=
+    extended_features[0] &=
         ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31));
   }
   // See Intel manual, volume 1, section 15.2.
   if ((xcr0 & 0xe6) != 0xe6) {
     // Clear AVX512F. Note we don't touch other AVX512 extensions because they
     // can be used with YMM.
-    extended_features &= ~(1u << 16);
+    extended_features[0] &= ~(1u << 16);
   }
 
   // Disable ADX instructions on Knights Landing. See OpenSSL commit
   // 64d92d74985ebb3d0be58a9718f9e080a14a8e7f.
   if ((ecx & (1u << 26)) == 0) {
-    extended_features &= ~(1u << 19);
+    extended_features[0] &= ~(1u << 19);
   }
 
   OPENSSL_ia32cap_P[0] = edx;
   OPENSSL_ia32cap_P[1] = ecx;
-  OPENSSL_ia32cap_P[2] = extended_features;
-  OPENSSL_ia32cap_P[3] = 0;
+  OPENSSL_ia32cap_P[2] = extended_features[0];
+  OPENSSL_ia32cap_P[3] = extended_features[1];
 
   const char *env1, *env2;
   env1 = getenv("OPENSSL_ia32cap");
diff --git a/src/crypto/fipsmodule/bcm.c b/src/crypto/fipsmodule/bcm.c
index 028ec4e..f382313 100644
--- a/src/crypto/fipsmodule/bcm.c
+++ b/src/crypto/fipsmodule/bcm.c
@@ -63,6 +63,7 @@
 #include "ec/p224-64.c"
 #include "../../third_party/fiat/p256.c"
 #include "ec/p256-x86_64.c"
+#include "ec/scalar.c"
 #include "ec/simple.c"
 #include "ec/util.c"
 #include "ec/wnaf.c"
diff --git a/src/crypto/fipsmodule/bn/bn_test.cc b/src/crypto/fipsmodule/bn/bn_test.cc
index 93d6d0f..a25d487 100644
--- a/src/crypto/fipsmodule/bn/bn_test.cc
+++ b/src/crypto/fipsmodule/bn/bn_test.cc
@@ -467,13 +467,13 @@
           r_words(new BN_ULONG[num_r]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
 
-      ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
-                               a_words.get(), num_a));
+      bn_mul_small(r_words.get(), num_r, a_words.get(), num_a, a_words.get(),
+                   num_a);
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
       EXPECT_BIGNUMS_EQUAL("A * A (words)", square.get(), ret.get());
 
       OPENSSL_memset(r_words.get(), 'A', num_r * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_sqr_small(r_words.get(), num_r, a_words.get(), num_a));
+      bn_sqr_small(r_words.get(), num_r, a_words.get(), num_a);
 
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
       EXPECT_BIGNUMS_EQUAL("A^2 (words)", square.get(), ret.get());
@@ -535,8 +535,8 @@
         ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
         ASSERT_TRUE(bn_copy_words(b_words.get(), num_b, b.get()));
 
-        ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
-                                 b_words.get(), num_b));
+        bn_mul_small(r_words.get(), num_r, a_words.get(), num_a, b_words.get(),
+                     num_b);
         ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), num_r));
         EXPECT_BIGNUMS_EQUAL("A * B (words)", product.get(), ret.get());
       }
@@ -630,8 +630,17 @@
     // Reduce |a| and |b| and test the Montgomery version.
     bssl::UniquePtr<BN_MONT_CTX> mont(
         BN_MONT_CTX_new_for_modulus(m.get(), ctx));
-    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(mont);
+
+    // Sanity-check that the constant-time version computes the same n0 and RR.
+    bssl::UniquePtr<BN_MONT_CTX> mont2(
+        BN_MONT_CTX_new_consttime(m.get(), ctx));
+    ASSERT_TRUE(mont2);
+    EXPECT_BIGNUMS_EQUAL("RR (mod M) (constant-time)", &mont->RR, &mont2->RR);
+    EXPECT_EQ(mont->n0[0], mont2->n0[0]);
+    EXPECT_EQ(mont->n0[1], mont2->n0[1]);
+
+    bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(a_tmp);
     ASSERT_TRUE(b_tmp);
     ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
@@ -651,16 +660,13 @@
           b_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
       ASSERT_TRUE(bn_copy_words(b_words.get(), m_width, b.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(b_words.get(), m_width, b_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m_width, a_words.get(), m_width, b_words.get(), m_width,
-          mont.get()));
+      bn_to_montgomery_small(a_words.get(), a_words.get(), m_width, mont.get());
+      bn_to_montgomery_small(b_words.get(), b_words.get(), m_width, mont.get());
+      bn_mod_mul_montgomery_small(r_words.get(), a_words.get(), b_words.get(),
+                                  m_width, mont.get());
       // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width, r_words.get(),
-                                           m_width, mont.get()));
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width,
+                               mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * B (mod M) (Montgomery, words)", mod_mul.get(),
                            ret.get());
@@ -718,13 +724,10 @@
       std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m_width]),
           a_copy_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m_width, a_words.get(), m_width, a_words.get(),
-          m_width, mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
-                                           r_words.get(), m_width, mont.get()));
+      bn_to_montgomery_small(a_words.get(), a_words.get(), m_width, mont.get());
+      bn_mod_mul_montgomery_small(r_words.get(), a_words.get(), a_words.get(),
+                                  m_width, mont.get());
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width, mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * A (mod M) (Montgomery, words)",
                            mod_square.get(), ret.get());
@@ -732,12 +735,11 @@
       // Repeat the operation with |a_copy_words|.
       OPENSSL_memcpy(a_copy_words.get(), a_words.get(),
                      m_width * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m_width, a_words.get(), m_width, a_copy_words.get(),
-          m_width, mont.get()));
+      bn_mod_mul_montgomery_small(r_words.get(), a_words.get(),
+                                  a_copy_words.get(), m_width, mont.get());
       // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
-                                           r_words.get(), m_width, mont.get()));
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width,
+                               mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * A_copy (mod M) (Montgomery, words)",
                            mod_square.get(), ret.get());
@@ -761,6 +763,9 @@
   ASSERT_TRUE(BN_mod_exp(ret.get(), a.get(), e.get(), m.get(), ctx));
   EXPECT_BIGNUMS_EQUAL("A ^ E (mod M)", mod_exp.get(), ret.get());
 
+  // The other implementations require reduced inputs.
+  ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
+
   if (BN_is_odd(m.get())) {
     ASSERT_TRUE(
         BN_mod_exp_mont(ret.get(), a.get(), e.get(), m.get(), ctx, NULL));
@@ -778,16 +783,14 @@
       bssl::UniquePtr<BN_MONT_CTX> mont(
           BN_MONT_CTX_new_for_modulus(m.get(), ctx));
       ASSERT_TRUE(mont.get());
-      ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
       std::unique_ptr<BN_ULONG[]> r_words(new BN_ULONG[m_width]),
           a_words(new BN_ULONG[m_width]);
       ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
-                                         m_width, mont.get()));
-      ASSERT_TRUE(bn_mod_exp_mont_small(r_words.get(), m_width, a_words.get(),
-                                        m_width, e->d, e->width, mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
-                                           r_words.get(), m_width, mont.get()));
+      bn_to_montgomery_small(a_words.get(), a_words.get(), m_width, mont.get());
+      bn_mod_exp_mont_small(r_words.get(), a_words.get(), m_width, e->d,
+                            e->width, mont.get());
+      bn_from_montgomery_small(r_words.get(), r_words.get(), m_width,
+                               mont.get());
       ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A ^ E (mod M) (Montgomery, words)", mod_exp.get(),
                            ret.get());
@@ -1530,6 +1533,10 @@
   EXPECT_FALSE(mont);
   ERR_clear_error();
 
+  mont.reset(BN_MONT_CTX_new_consttime(b.get(), ctx()));
+  EXPECT_FALSE(mont);
+  ERR_clear_error();
+
   // Some operations also may not be used with an even modulus.
   ASSERT_TRUE(BN_set_word(b.get(), 16));
 
@@ -1537,6 +1544,10 @@
   EXPECT_FALSE(mont);
   ERR_clear_error();
 
+  mont.reset(BN_MONT_CTX_new_consttime(b.get(), ctx()));
+  EXPECT_FALSE(mont);
+  ERR_clear_error();
+
   EXPECT_FALSE(BN_mod_exp_mont(a.get(), BN_value_one(), BN_value_one(), b.get(),
                                ctx(), NULL));
   ERR_clear_error();
@@ -1555,21 +1566,16 @@
   ASSERT_TRUE(BN_rand(a.get(), 1024, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY));
   BN_zero(zero.get());
 
-  ASSERT_TRUE(
-      BN_mod_exp(r.get(), a.get(), zero.get(), BN_value_one(), nullptr));
-  EXPECT_TRUE(BN_is_zero(r.get()));
-
-  ASSERT_TRUE(BN_mod_exp_mont(r.get(), a.get(), zero.get(), BN_value_one(),
-                              nullptr, nullptr));
-  EXPECT_TRUE(BN_is_zero(r.get()));
-
-  ASSERT_TRUE(BN_mod_exp_mont_consttime(r.get(), a.get(), zero.get(),
-                                        BN_value_one(), nullptr, nullptr));
+  ASSERT_TRUE(BN_mod_exp(r.get(), a.get(), zero.get(), BN_value_one(), ctx()));
   EXPECT_TRUE(BN_is_zero(r.get()));
 
   ASSERT_TRUE(BN_mod_exp_mont_word(r.get(), 42, zero.get(), BN_value_one(),
-                                   nullptr, nullptr));
+                                   ctx(), nullptr));
   EXPECT_TRUE(BN_is_zero(r.get()));
+
+  // The other modular exponentiation functions, |BN_mod_exp_mont| and
+  // |BN_mod_exp_mont_consttime|, require fully-reduced inputs, so 1**0 mod 1 is
+  // not a valid call.
 }
 
 TEST_F(BNTest, SmallPrime) {
@@ -2262,26 +2268,43 @@
   EXPECT_TRUE(BN_is_pow2(eight.get()));
 
   // |BN_MONT_CTX| is always stored minimally and uses the same R independent of
-  // input width.
+  // input width. Additionally, mont->RR is always the same width as mont->N,
+  // even if it fits in a smaller value.
   static const uint8_t kP[] = {
-      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff,
-      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01,
   };
   bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
   ASSERT_TRUE(p);
 
+  // Test both the constant-time and variable-time functions at both minimal and
+  // non-minimal |p|.
   bssl::UniquePtr<BN_MONT_CTX> mont(
       BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
   ASSERT_TRUE(mont);
-
-  ASSERT_TRUE(bn_resize_words(p.get(), 32));
   bssl::UniquePtr<BN_MONT_CTX> mont2(
-      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+      BN_MONT_CTX_new_consttime(p.get(), ctx()));
   ASSERT_TRUE(mont2);
 
+  ASSERT_TRUE(bn_resize_words(p.get(), 32));
+  bssl::UniquePtr<BN_MONT_CTX> mont3(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont3);
+  bssl::UniquePtr<BN_MONT_CTX> mont4(
+      BN_MONT_CTX_new_consttime(p.get(), ctx()));
+  ASSERT_TRUE(mont4);
+
   EXPECT_EQ(mont->N.width, mont2->N.width);
+  EXPECT_EQ(mont->N.width, mont3->N.width);
+  EXPECT_EQ(mont->N.width, mont4->N.width);
   EXPECT_EQ(0, BN_cmp(&mont->RR, &mont2->RR));
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont3->RR));
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont4->RR));
+  EXPECT_EQ(mont->N.width, mont->RR.width);
+  EXPECT_EQ(mont->N.width, mont2->RR.width);
+  EXPECT_EQ(mont->N.width, mont3->RR.width);
+  EXPECT_EQ(mont->N.width, mont4->RR.width);
 }
 
 TEST_F(BNTest, CountLowZeroBits) {
diff --git a/src/crypto/fipsmodule/bn/bn_tests.txt b/src/crypto/fipsmodule/bn/bn_tests.txt
index 7f85a02..6bdca42 100644
--- a/src/crypto/fipsmodule/bn/bn_tests.txt
+++ b/src/crypto/fipsmodule/bn/bn_tests.txt
@@ -10120,6 +10120,17 @@
 E = 02
 M = 414141414141414141414127414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141414141800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001
 
+# Cover the E = 0 case for small numbers.
+ModExp = 01
+A = 86b49
+E = 00
+M = 30d26ecb
+
+ModExp = 00
+A = 00
+E = 00
+M = 01
+
 ModExp = 208f8aa0
 A = 86b49
 E = 2
diff --git a/src/crypto/fipsmodule/bn/exponentiation.c b/src/crypto/fipsmodule/bn/exponentiation.c
index c85c00b..b07111e 100644
--- a/src/crypto/fipsmodule/bn/exponentiation.c
+++ b/src/crypto/fipsmodule/bn/exponentiation.c
@@ -109,6 +109,7 @@
 #include <openssl/bn.h>
 
 #include <assert.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include <openssl/cpu.h>
@@ -585,6 +586,13 @@
 
 int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
                BN_CTX *ctx) {
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    if (!BN_nnmod(r, a, m, ctx)) {
+      return 0;
+    }
+    a = r;
+  }
+
   if (BN_is_odd(m)) {
     return BN_mod_exp_mont(r, a, p, m, ctx, NULL);
   }
@@ -598,6 +606,11 @@
     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
     return 0;
   }
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
+    return 0;
+  }
+
   int bits = BN_num_bits(p);
   if (bits == 0) {
     // x**0 mod 1 is still zero.
@@ -622,35 +635,19 @@
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    new_mont = BN_MONT_CTX_new_consttime(m, ctx);
     if (new_mont == NULL) {
       goto err;
     }
     mont = new_mont;
   }
 
-  const BIGNUM *aa;
-  if (a->neg || BN_ucmp(a, m) >= 0) {
-    if (!BN_nnmod(val[0], a, m, ctx)) {
-      goto err;
-    }
-    aa = val[0];
-  } else {
-    aa = a;
-  }
-
-  if (BN_is_zero(aa)) {
-    BN_zero(rr);
-    ret = 1;
-    goto err;
-  }
-
   // We exponentiate by looking at sliding windows of the exponent and
-  // precomputing powers of |aa|. Windows may be shifted so they always end on a
-  // set bit, so only precompute odd powers. We compute val[i] = aa^(2*i + 1)
+  // precomputing powers of |a|. Windows may be shifted so they always end on a
+  // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1)
   // for i = 0 to 2^(window-1), all in Montgomery form.
   int window = BN_window_bits_for_exponent_size(bits);
-  if (!BN_to_montgomery(val[0], aa, mont, ctx)) {
+  if (!BN_to_montgomery(val[0], a, mont, ctx)) {
     goto err;
   }
   if (window > 1) {
@@ -666,10 +663,8 @@
     }
   }
 
-  if (!bn_one_to_montgomery(r, mont, ctx)) {
-    goto err;
-  }
-
+  // |p| is non-zero, so at least one window is non-zero. To save some
+  // multiplications, defer initializing |r| until then.
   int r_is_one = 1;
   int wstart = bits - 1;  // The top bit of the window.
   for (;;) {
@@ -706,7 +701,11 @@
 
     assert(wvalue & 1);
     assert(wvalue < (1 << window));
-    if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
+    if (r_is_one) {
+      if (!BN_copy(r, val[wvalue >> 1])) {
+        goto err;
+      }
+    } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
       goto err;
     }
 
@@ -717,6 +716,9 @@
     wstart -= wsize + 1;
   }
 
+  // |p| is non-zero, so |r_is_one| must be cleared at some point.
+  assert(!r_is_one);
+
   if (!BN_from_montgomery(rr, r, mont, ctx)) {
     goto err;
   }
@@ -728,29 +730,24 @@
   return ret;
 }
 
-int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                          size_t num_a, const BN_ULONG *p, size_t num_p,
-                          const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.width;
-  if (num_n != num_a || num_n != num_r || num_n > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                           const BN_ULONG *p, size_t num_p,
+                           const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
-  if (!BN_is_odd(&mont->N)) {
-    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
-    return 0;
+  assert(BN_is_odd(&mont->N));
+
+  // Count the number of bits in |p|. Note this function treats |p| as public.
+  while (num_p != 0 && p[num_p - 1] == 0) {
+    num_p--;
   }
-  unsigned bits = 0;
-  if (num_p != 0) {
-    bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2;
+  if (num_p == 0) {
+    bn_from_montgomery_small(r, mont->RR.d, num, mont);
+    return;
   }
-  if (bits == 0) {
-    OPENSSL_memset(r, 0, num_r * sizeof(BN_ULONG));
-    if (!BN_is_one(&mont->N)) {
-      r[0] = 1;
-    }
-    return 1;
-  }
+  unsigned bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2;
+  assert(bits != 0);
 
   // We exponentiate by looking at sliding windows of the exponent and
   // precomputing powers of |a|. Windows may be shifted so they always end on a
@@ -760,34 +757,24 @@
   if (window > TABLE_BITS_SMALL) {
     window = TABLE_BITS_SMALL;  // Tolerate excessively large |p|.
   }
-  int ret = 0;
   BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS];
-  OPENSSL_memcpy(val[0], a, num_n * sizeof(BN_ULONG));
+  OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG));
   if (window > 1) {
     BN_ULONG d[BN_SMALL_MAX_WORDS];
-    if (!bn_mod_mul_montgomery_small(d, num_n, val[0], num_n, val[0], num_n,
-                                     mont)) {
-      goto err;
-    }
+    bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont);
     for (unsigned i = 1; i < 1u << (window - 1); i++) {
-      if (!bn_mod_mul_montgomery_small(val[i], num_n, val[i - 1], num_n, d,
-                                       num_n, mont)) {
-        goto err;
-      }
+      bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont);
     }
   }
 
-  if (!bn_one_to_montgomery_small(r, num_r, mont)) {
-    goto err;
-  }
-
+  // |p| is non-zero, so at least one window is non-zero. To save some
+  // multiplications, defer initializing |r| until then.
   int r_is_one = 1;
   unsigned wstart = bits - 1;  // The top bit of the window.
   for (;;) {
     if (!bn_is_bit_set_words(p, num_p, wstart)) {
-      if (!r_is_one &&
-          !bn_mod_mul_montgomery_small(r, num_r, r, num_r, r, num_r, mont)) {
-        goto err;
+      if (!r_is_one) {
+        bn_mod_mul_montgomery_small(r, r, r, num, mont);
       }
       if (wstart == 0) {
         break;
@@ -810,19 +797,17 @@
     // Shift |r| to the end of the window.
     if (!r_is_one) {
       for (unsigned i = 0; i < wsize + 1; i++) {
-        if (!bn_mod_mul_montgomery_small(r, num_r, r, num_r, r, num_r, mont)) {
-          goto err;
-        }
+        bn_mod_mul_montgomery_small(r, r, r, num, mont);
       }
     }
 
     assert(wvalue & 1);
     assert(wvalue < (1u << window));
-    if (!bn_mod_mul_montgomery_small(r, num_r, r, num_r, val[wvalue >> 1],
-                                     num_n, mont)) {
-      goto err;
+    if (r_is_one) {
+      OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG));
+    } else {
+      bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont);
     }
-
     r_is_one = 0;
     if (wstart == wsize) {
       break;
@@ -830,38 +815,33 @@
     wstart -= wsize + 1;
   }
 
-  ret = 1;
-
-err:
+  // |p| is non-zero, so |r_is_one| must be cleared at some point.
+  assert(!r_is_one);
   OPENSSL_cleanse(val, sizeof(val));
-  return ret;
 }
 
-int bn_mod_inverse_prime_mont_small(BN_ULONG *r, size_t num_r,
-                                    const BN_ULONG *a, size_t num_a,
-                                    const BN_MONT_CTX *mont) {
-  const BN_ULONG *p = mont->N.d;
-  size_t num_p = mont->N.width;
-  if (num_p > BN_SMALL_MAX_WORDS || num_p == 0) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_mod_inverse_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                                     const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
 
   // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime.
   BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS];
-  OPENSSL_memcpy(p_minus_two, p, num_p * sizeof(BN_ULONG));
+  const BN_ULONG *p = mont->N.d;
+  OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG));
   if (p_minus_two[0] >= 2) {
     p_minus_two[0] -= 2;
   } else {
     p_minus_two[0] -= 2;
-    for (size_t i = 1; i < num_p; i++) {
+    for (size_t i = 1; i < num; i++) {
       if (p_minus_two[i]-- != 0) {
         break;
       }
     }
   }
 
-  return bn_mod_exp_mont_small(r, num_r, a, num_a, p_minus_two, num_p, mont);
+  bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont);
 }
 
 
@@ -988,12 +968,15 @@
   int powerbufLen = 0;
   unsigned char *powerbuf = NULL;
   BIGNUM tmp, am;
-  BIGNUM *new_a = NULL;
 
   if (!BN_is_odd(m)) {
     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
     return 0;
   }
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
+    return 0;
+  }
 
   // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak
   // whether the top bits are zero.
@@ -1010,7 +993,7 @@
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    new_mont = BN_MONT_CTX_new_consttime(m, ctx);
     if (new_mont == NULL) {
       goto err;
     }
@@ -1021,15 +1004,6 @@
   // implementation assumes it can use |top| to size R.
   int top = mont->N.width;
 
-  if (a->neg || BN_ucmp(a, m) >= 0) {
-    new_a = BN_new();
-    if (new_a == NULL ||
-        !BN_nnmod(new_a, a, m, ctx)) {
-      goto err;
-    }
-    a = new_a;
-  }
-
 #ifdef RSAZ_ENABLED
   // If the size of the operands allow it, perform the optimized
   // RSAZ exponentiation. For further information see
@@ -1290,7 +1264,6 @@
 
 err:
   BN_MONT_CTX_free(new_mont);
-  BN_clear_free(new_a);
   OPENSSL_free(powerbufFree);
   return (ret);
 }
@@ -1303,6 +1276,11 @@
 
   int ret = 0;
 
+  // BN_mod_exp_mont requires reduced inputs.
+  if (bn_minimal_width(m) == 1) {
+    a %= m->d[0];
+  }
+
   if (!BN_set_word(&a_bignum, a)) {
     OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
     goto err;
diff --git a/src/crypto/fipsmodule/bn/internal.h b/src/crypto/fipsmodule/bn/internal.h
index 668d8dd..2fc38df 100644
--- a/src/crypto/fipsmodule/bn/internal.h
+++ b/src/crypto/fipsmodule/bn/internal.h
@@ -519,77 +519,59 @@
 #endif
 
 // bn_mul_small sets |r| to |a|*|b|. |num_r| must be |num_a| + |num_b|. |r| may
-// not alias with |a| or |b|. This function returns one on success and zero if
-// lengths are inconsistent.
-int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
+// not alias with |a| or |b|.
+void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
                  const BN_ULONG *b, size_t num_b);
 
 // bn_sqr_small sets |r| to |a|^2. |num_a| must be at most |BN_SMALL_MAX_WORDS|.
-// |num_r| must be |num_a|*2. |r| and |a| may not alias. This function returns
-// one on success and zero on programmer error.
-int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
+// |num_r| must be |num_a|*2. |r| and |a| may not alias.
+void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
 
 // In the following functions, the modulus must be at most |BN_SMALL_MAX_WORDS|
 // words long.
 
 // bn_to_montgomery_small sets |r| to |a| translated to the Montgomery domain.
-// |num_a| and |num_r| must be the length of the modulus, which is
-// |mont->N.top|. |a| must be fully reduced. This function returns one on
-// success and zero if lengths are inconsistent. |r| and |a| may alias.
-int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                           size_t num_a, const BN_MONT_CTX *mont);
+// |r| and |a| are |num| words long, which must be |mont->N.width|. |a| must be
+// fully reduced and may alias |r|.
+void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                            const BN_MONT_CTX *mont);
 
 // bn_from_montgomery_small sets |r| to |a| translated out of the Montgomery
-// domain. |num_r| must be the length of the modulus, which is |mont->N.top|.
-// |a| must be at most |mont->N.top| * R and |num_a| must be at most 2 *
-// |mont->N.top|. This function returns one on success and zero if lengths are
-// inconsistent. |r| and |a| may alias.
-int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                             size_t num_a, const BN_MONT_CTX *mont);
-
-// bn_one_to_montgomery_small sets |r| to one in Montgomery form. It returns one
-// on success and zero on error. |num_r| must be the length of the modulus,
-// which is |mont->N.top|. This function treats the bit width of the modulus as
-// public.
-int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
-                               const BN_MONT_CTX *mont);
+// domain. |r| and |a| are |num| words long, which must be |mont->N.width|. |a|
+// must be fully-reduced and may alias |r|.
+void bn_from_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                              const BN_MONT_CTX *mont);
 
 // bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
-// and outputs are in the Montgomery domain. |num_r| must be the length of the
-// modulus, which is |mont->N.top|. This function returns one on success and
-// zero on internal error or inconsistent lengths. Any two of |r|, |a|, and |b|
-// may alias.
-//
-// This function requires |a| * |b| < N * R, where N is the modulus and R is the
-// Montgomery divisor, 2^(N.top * BN_BITS2). This should generally be satisfied
-// by ensuring |a| and |b| are fully reduced, however ECDSA has one computation
-// which requires the more general bound.
-int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                                size_t num_a, const BN_ULONG *b, size_t num_b,
-                                const BN_MONT_CTX *mont);
+// and outputs are in the Montgomery domain. Each array is |num| words long,
+// which must be |mont->N.width|. Any two of |r|, |a|, and |b| may alias. |a|
+// and |b| must be reduced on input.
+void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
+                                 const BN_ULONG *b, size_t num,
+                                 const BN_MONT_CTX *mont);
 
 // bn_mod_exp_mont_small sets |r| to |a|^|p| mod |mont->N|. It returns one on
 // success and zero on programmer or internal error. Both inputs and outputs are
-// in the Montgomery domain. |num_r| and |num_a| must be |mont->N.top|, which
-// must be at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced. This
-// function runs in time independent of |a|, but |p| and |mont->N| are public
-// values.
+// in the Montgomery domain. |r| and |a| are |num| words long, which must be
+// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced.
+// This function runs in time independent of |a|, but |p| and |mont->N| are
+// public values. |a| must be fully-reduced and may alias with |r|.
 //
 // Note this function differs from |BN_mod_exp_mont| which uses Montgomery
 // reduction but takes input and output outside the Montgomery domain. Combine
 // this function with |bn_from_montgomery_small| and |bn_to_montgomery_small|
 // if necessary.
-int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                          size_t num_a, const BN_ULONG *p, size_t num_p,
-                          const BN_MONT_CTX *mont);
+void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                           const BN_ULONG *p, size_t num_p,
+                           const BN_MONT_CTX *mont);
 
 // bn_mod_inverse_prime_mont_small sets |r| to |a|^-1 mod |mont->N|. |mont->N|
-// must be a prime. |num_r| and |num_a| must be |mont->N.top|, which must be at
-// most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced. This function runs in
-// time independent of |a|, but |mont->N| is a public value.
-int bn_mod_inverse_prime_mont_small(BN_ULONG *r, size_t num_r,
-                                    const BN_ULONG *a, size_t num_a,
-                                    const BN_MONT_CTX *mont);
+// must be a prime. |r| and |a| are |num| words long, which must be
+// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced
+// and may alias |r|. This function runs in time independent of |a|, but
+// |mont->N| is a public value.
+void bn_mod_inverse_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                                     const BN_MONT_CTX *mont);
 
 
 #if defined(__cplusplus)
diff --git a/src/crypto/fipsmodule/bn/montgomery.c b/src/crypto/fipsmodule/bn/montgomery.c
index 7ce8c4c..851c0a0 100644
--- a/src/crypto/fipsmodule/bn/montgomery.c
+++ b/src/crypto/fipsmodule/bn/montgomery.c
@@ -109,6 +109,8 @@
 #include <openssl/bn.h>
 
 #include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include <openssl/err.h>
@@ -170,7 +172,7 @@
 OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS ==
                        sizeof(uint64_t), BN_MONT_CTX_set_64_bit_mismatch);
 
-int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) {
   if (BN_is_zero(mod)) {
     OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
     return 0;
@@ -207,6 +209,13 @@
 #else
   mont->n0[1] = 0;
 #endif
+  return 1;
+}
+
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+  if (!bn_mont_ctx_set_N_and_n0(mont, mod)) {
+    return 0;
+  }
 
   BN_CTX *new_ctx = NULL;
   if (ctx == NULL) {
@@ -223,7 +232,10 @@
   // BN_BITS2|, is correct because R**2 will still be a multiple of the latter
   // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
   unsigned lgBigR = mont->N.width * BN_BITS2;
-  int ok = bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx);
+  BN_zero(&mont->RR);
+  int ok = BN_set_bit(&mont->RR, lgBigR * 2) &&
+           BN_mod(&mont->RR, &mont->RR, &mont->N, ctx) &&
+           bn_resize_words(&mont->RR, mont->N.width);
   BN_CTX_free(new_ctx);
   return ok;
 }
@@ -238,6 +250,24 @@
   return mont;
 }
 
+BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) {
+  BN_MONT_CTX *mont = BN_MONT_CTX_new();
+  if (mont == NULL ||
+      !bn_mont_ctx_set_N_and_n0(mont, mod)) {
+    goto err;
+  }
+  unsigned lgBigR = mont->N.width * BN_BITS2;
+  if (!bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx) ||
+      !bn_resize_words(&mont->RR, mont->N.width)) {
+    goto err;
+  }
+  return mont;
+
+err:
+  BN_MONT_CTX_free(mont);
+  return NULL;
+}
+
 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
                            const BIGNUM *mod, BN_CTX *bn_ctx) {
   CRYPTO_MUTEX_lock_read(lock);
@@ -427,89 +457,53 @@
          bn_fits_in_words(bn, mont->N.width);
 }
 
-int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                           size_t num_a, const BN_MONT_CTX *mont) {
-  return bn_mod_mul_montgomery_small(r, num_r, a, num_a, mont->RR.d,
-                                     mont->RR.width, mont);
+void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                            const BN_MONT_CTX *mont) {
+  bn_mod_mul_montgomery_small(r, a, mont->RR.d, num, mont);
 }
 
-int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                             size_t num_a, const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.width;
-  if (num_a > 2 * num_n || num_r != num_n || num_n > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_from_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
+                              const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
   BN_ULONG tmp[BN_SMALL_MAX_WORDS * 2];
-  size_t num_tmp = 2 * num_n;
-  OPENSSL_memcpy(tmp, a, num_a * sizeof(BN_ULONG));
-  OPENSSL_memset(tmp + num_a, 0, (num_tmp - num_a) * sizeof(BN_ULONG));
-  int ret = bn_from_montgomery_in_place(r, num_r, tmp, num_tmp, mont);
-  OPENSSL_cleanse(tmp, num_tmp * sizeof(BN_ULONG));
-  return ret;
+  OPENSSL_memcpy(tmp, a, num * sizeof(BN_ULONG));
+  OPENSSL_memset(tmp + num, 0, num * sizeof(BN_ULONG));
+  if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) {
+    abort();
+  }
+  OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
 }
 
-int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
-                               const BN_MONT_CTX *mont) {
-  const BN_ULONG *n = mont->N.d;
-  size_t num_n = mont->N.width;
-  if (num_n == 0 || num_r != num_n) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
-  }
-
-  // If the high bit of |n| is set, R = 2^(num_n*BN_BITS2) < 2 * |n|, so we
-  // compute R - |n| rather than perform Montgomery reduction.
-  if (num_n > 0 && (n[num_n - 1] >> (BN_BITS2 - 1)) != 0) {
-    r[0] = 0 - n[0];
-    for (size_t i = 1; i < num_n; i++) {
-      r[i] = ~n[i];
-    }
-    return 1;
-  }
-
-  return bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.width, mont);
-}
-
-int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
-                                size_t num_a, const BN_ULONG *b, size_t num_b,
-                                const BN_MONT_CTX *mont) {
-  size_t num_n = mont->N.width;
-  if (num_r != num_n || num_a + num_b > 2 * num_n ||
-      num_n > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
+                                 const BN_ULONG *b, size_t num,
+                                 const BN_MONT_CTX *mont) {
+  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
+    abort();
   }
 
 #if defined(OPENSSL_BN_ASM_MONT)
   // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
-  if (num_n >= (128 / BN_BITS2) &&
-      num_a == num_n &&
-      num_b == num_n) {
-    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num_n)) {
-      assert(0);  // The check above ensures this won't happen.
-      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-      return 0;
+  if (num >= (128 / BN_BITS2)) {
+    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
+      abort();  // The check above ensures this won't happen.
     }
-    return 1;
+    return;
   }
 #endif
 
   // Compute the product.
   BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS];
-  size_t num_tmp = 2 * num_n;
-  size_t num_ab = num_a + num_b;
-  if (a == b && num_a == num_b) {
-    if (!bn_sqr_small(tmp, num_ab, a, num_a)) {
-      return 0;
-    }
-  } else if (!bn_mul_small(tmp, num_ab, a, num_a, b, num_b)) {
-    return 0;
+  if (a == b) {
+    bn_sqr_small(tmp, 2 * num, a, num);
+  } else {
+    bn_mul_small(tmp, 2 * num, a, num, b, num);
   }
 
-  // Zero-extend to full width and reduce.
-  OPENSSL_memset(tmp + num_ab, 0, (num_tmp - num_ab) * sizeof(BN_ULONG));
-  int ret = bn_from_montgomery_in_place(r, num_r, tmp, num_tmp, mont);
-  OPENSSL_cleanse(tmp, num_tmp * sizeof(BN_ULONG));
-  return ret;
+  // Reduce.
+  if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) {
+    abort();
+  }
+  OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
 }
diff --git a/src/crypto/fipsmodule/bn/montgomery_inv.c b/src/crypto/fipsmodule/bn/montgomery_inv.c
index a920ca4..94d99e8 100644
--- a/src/crypto/fipsmodule/bn/montgomery_inv.c
+++ b/src/crypto/fipsmodule/bn/montgomery_inv.c
@@ -32,7 +32,8 @@
 #define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
 
 uint64_t bn_mont_n0(const BIGNUM *n) {
-  // These conditions are checked by the caller, |BN_MONT_CTX_set|.
+  // These conditions are checked by the caller, |BN_MONT_CTX_set| or
+  // |BN_MONT_CTX_new_consttime|.
   assert(!BN_is_zero(n));
   assert(!BN_is_negative(n));
   assert(BN_is_odd(n));
diff --git a/src/crypto/fipsmodule/bn/mul.c b/src/crypto/fipsmodule/bn/mul.c
index 4a0711d..bd9393e 100644
--- a/src/crypto/fipsmodule/bn/mul.c
+++ b/src/crypto/fipsmodule/bn/mul.c
@@ -57,6 +57,7 @@
 #include <openssl/bn.h>
 
 #include <assert.h>
+#include <stdlib.h>
 #include <string.h>
 
 #include <openssl/err.h>
@@ -656,11 +657,10 @@
   return bn_mul_impl(r, a, b, ctx);
 }
 
-int bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
-                 const BN_ULONG *b, size_t num_b) {
+void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
+                  const BN_ULONG *b, size_t num_b) {
   if (num_r != num_a + num_b) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+    abort();
   }
   // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not
   // hit that code.
@@ -669,7 +669,6 @@
   } else {
     bn_mul_normal(r, a, num_a, b, num_b);
   }
-  return 1;
 }
 
 // tmp must have 2*n words
@@ -858,10 +857,9 @@
   return 1;
 }
 
-int bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
+void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
   if (num_r != 2 * num_a || num_a > BN_SMALL_MAX_WORDS) {
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
+    abort();
   }
   if (num_a == 4) {
     bn_sqr_comba4(r, a);
@@ -872,5 +870,4 @@
     bn_sqr_normal(r, a, num_a, tmp);
     OPENSSL_cleanse(tmp, 2 * num_a * sizeof(BN_ULONG));
   }
-  return 1;
 }
diff --git a/src/crypto/fipsmodule/bn/prime.c b/src/crypto/fipsmodule/bn/prime.c
index a18d377..80b33c2 100644
--- a/src/crypto/fipsmodule/bn/prime.c
+++ b/src/crypto/fipsmodule/bn/prime.c
@@ -690,7 +690,7 @@
   BIGNUM *z = BN_CTX_get(ctx);
   BIGNUM *one_mont = BN_CTX_get(ctx);
   BIGNUM *w1_mont = BN_CTX_get(ctx);
-  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
+  mont = BN_MONT_CTX_new_consttime(w, ctx);
   if (b == NULL || z == NULL || one_mont == NULL || w1_mont == NULL ||
       mont == NULL ||
       !bn_one_to_montgomery(one_mont, mont, ctx) ||
diff --git a/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
index 2a21140..c2f67f4 100755
--- a/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
+++ b/src/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
@@ -1,15 +1,17 @@
 #! /usr/bin/env perl
 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
 #
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
 # (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
 #
 # Reference:
 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
@@ -18,23 +20,25 @@
 # Further optimization by <appro@openssl.org>:
 #
 #		this/original	with/without -DECP_NISTZ256_ASM(*)
-# Opteron	+12-49%		+110-150%
-# Bulldozer	+14-45%		+175-210%
-# P4		+18-46%		n/a :-(
-# Westmere	+12-34%		+80-87%
-# Sandy Bridge	+9-35%		+110-120%
-# Ivy Bridge	+9-35%		+110-125%
-# Haswell	+8-37%		+140-160%
-# Broadwell	+18-58%		+145-210%
-# Atom		+15-50%		+130-180%
-# VIA Nano	+43-160%	+300-480%
+# Opteron	+15-49%		+150-195%
+# Bulldozer	+18-45%		+175-240%
+# P4		+24-46%		+100-150%
+# Westmere	+18-34%		+87-160%
+# Sandy Bridge	+14-35%		+120-185%
+# Ivy Bridge	+11-35%		+125-180%
+# Haswell	+10-37%		+160-200%
+# Broadwell	+24-58%		+210-270%
+# Atom		+20-50%		+180-240%
+# VIA Nano	+50-160%	+480-480%
 #
 # (*)	"without -DECP_NISTZ256_ASM" refers to build with
 #	"enable-ec_nistp_64_gcc_128";
 #
 # Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In "this/original" column lower coefficient is for
+# ECDSA sign, while in "with/without" - for ECDH key agreement, and
+# higher - for ECDSA sign, relatively fastest server-side operation.
+# Keep in mind that +100% means 2x improvement.
 
 $flavour = shift;
 $output  = shift;
@@ -71,6 +75,12 @@
 .long 3,3,3,3,3,3,3,3
 .LONE_mont:
 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
 ___
 
 {
@@ -145,6 +155,1087 @@
 
 $code.=<<___;
 ################################################################################
+# void ecp_nistz256_ord_mul_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   uint64_t b[4]);
+
+.globl	ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,\@function,3
+.align	32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+___
+$code.=<<___	if ($addx);
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
+	cmp	\$0x80100, %ecx
+	je	.Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_mul_body:
+
+	mov	8*0($b_org), %rax
+	mov	$b_org, $b_ptr
+	lea	.Lord(%rip), %r14
+	mov	.LordK(%rip), %r15
+
+	################################# * b[0]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	mov	%rax, $acc0
+	mov	$t0, %rax
+	mov	%rdx, $acc1
+
+	mulq	8*1($a_ptr)
+	add	%rax, $acc1
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $acc2
+
+	mulq	8*2($a_ptr)
+	add	%rax, $acc2
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc0, $acc5
+	 imulq	%r15,$acc0
+
+	mov	%rdx, $acc3
+	mulq	8*3($a_ptr)
+	add	%rax, $acc3
+	 mov	$acc0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $acc4
+
+	################################# First reduction step
+	mulq	8*0(%r14)
+	mov	$acc0, $t1
+	add	%rax, $acc5		# guaranteed to be zero
+	mov	$acc0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t0
+
+	sub	$acc0, $acc2
+	sbb	\$0, $acc0		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc1
+	adc	\$0, %rdx
+	add	%rax, $acc1
+	mov	$t1, %rax
+	adc	%rdx, $acc2
+	mov	$t1, %rdx
+	adc	\$0, $acc0		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc3
+	 mov	8*1($b_ptr), %rax
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc0, $acc3
+	adc	$t1, $acc4
+	adc	\$0, $acc5
+
+	################################# * b[1]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	add	%rax, $acc1
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*1($a_ptr)
+	add	$t1, $acc2
+	adc	\$0, %rdx
+	add	%rax, $acc2
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*2($a_ptr)
+	add	$t1, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc1, $t0
+	 imulq	%r15, $acc1
+
+	mov	%rdx, $t1
+	mulq	8*3($a_ptr)
+	add	$t1, $acc4
+	adc	\$0, %rdx
+	xor	$acc0, $acc0
+	add	%rax, $acc4
+	 mov	$acc1, %rax
+	adc	%rdx, $acc5
+	adc	\$0, $acc0
+
+	################################# Second reduction step
+	mulq	8*0(%r14)
+	mov	$acc1, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	$acc1, %rax
+	adc	%rdx, $t0
+
+	sub	$acc1, $acc3
+	sbb	\$0, $acc1		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc2
+	adc	\$0, %rdx
+	add	%rax, $acc2
+	mov	$t1, %rax
+	adc	%rdx, $acc3
+	mov	$t1, %rdx
+	adc	\$0, $acc1		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc4
+	 mov	8*2($b_ptr), %rax
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc1, $acc4
+	adc	$t1, $acc5
+	adc	\$0, $acc0
+
+	################################## * b[2]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	add	%rax, $acc2
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*1($a_ptr)
+	add	$t1, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*2($a_ptr)
+	add	$t1, $acc4
+	adc	\$0, %rdx
+	add	%rax, $acc4
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc2, $t0
+	 imulq	%r15, $acc2
+
+	mov	%rdx, $t1
+	mulq	8*3($a_ptr)
+	add	$t1, $acc5
+	adc	\$0, %rdx
+	xor	$acc1, $acc1
+	add	%rax, $acc5
+	 mov	$acc2, %rax
+	adc	%rdx, $acc0
+	adc	\$0, $acc1
+
+	################################# Third reduction step
+	mulq	8*0(%r14)
+	mov	$acc2, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	$acc2, %rax
+	adc	%rdx, $t0
+
+	sub	$acc2, $acc4
+	sbb	\$0, $acc2		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$t1, %rax
+	adc	%rdx, $acc4
+	mov	$t1, %rdx
+	adc	\$0, $acc2		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc5
+	 mov	8*3($b_ptr), %rax
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc2, $acc5
+	adc	$t1, $acc0
+	adc	\$0, $acc1
+
+	################################# * b[3]
+	mov	%rax, $t0
+	mulq	8*0($a_ptr)
+	add	%rax, $acc3
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*1($a_ptr)
+	add	$t1, $acc4
+	adc	\$0, %rdx
+	add	%rax, $acc4
+	mov	$t0, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mulq	8*2($a_ptr)
+	add	$t1, $acc5
+	adc	\$0, %rdx
+	add	%rax, $acc5
+	mov	$t0, %rax
+	adc	\$0, %rdx
+
+	 mov	$acc3, $t0
+	 imulq	%r15, $acc3
+
+	mov	%rdx, $t1
+	mulq	8*3($a_ptr)
+	add	$t1, $acc0
+	adc	\$0, %rdx
+	xor	$acc2, $acc2
+	add	%rax, $acc0
+	 mov	$acc3, %rax
+	adc	%rdx, $acc1
+	adc	\$0, $acc2
+
+	################################# Last reduction step
+	mulq	8*0(%r14)
+	mov	$acc3, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	$acc3, %rax
+	adc	%rdx, $t0
+
+	sub	$acc3, $acc5
+	sbb	\$0, $acc3		# can't borrow
+
+	mulq	8*1(%r14)
+	add	$t0, $acc4
+	adc	\$0, %rdx
+	add	%rax, $acc4
+	mov	$t1, %rax
+	adc	%rdx, $acc5
+	mov	$t1, %rdx
+	adc	\$0, $acc3		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc0
+	sbb	%rdx, $t1		# can't borrow
+
+	add	$acc3, $acc0
+	adc	$t1, $acc1
+	adc	\$0, $acc2
+
+	################################# Subtract ord
+	 mov	$acc4, $a_ptr
+	sub	8*0(%r14), $acc4
+	 mov	$acc5, $acc3
+	sbb	8*1(%r14), $acc5
+	 mov	$acc0, $t0
+	sbb	8*2(%r14), $acc0
+	 mov	$acc1, $t1
+	sbb	8*3(%r14), $acc1
+	sbb	\$0, $acc2
+
+	cmovc	$a_ptr, $acc4
+	cmovc	$acc3, $acc5
+	cmovc	$t0, $acc0
+	cmovc	$t1, $acc1
+
+	mov	$acc4, 8*0($r_ptr)
+	mov	$acc5, 8*1($r_ptr)
+	mov	$acc0, 8*2($r_ptr)
+	mov	$acc1, 8*3($r_ptr)
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mul_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   int rep);
+
+.globl	ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,\@function,3
+.align	32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+___
+$code.=<<___	if ($addx);
+	leaq	OPENSSL_ia32cap_P(%rip), %rcx
+	mov	8(%rcx), %rcx
+	and	\$0x80100, %ecx
+	cmp	\$0x80100, %ecx
+	je	.Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_sqr_body:
+
+	mov	8*0($a_ptr), $acc0
+	mov	8*1($a_ptr), %rax
+	mov	8*2($a_ptr), $acc6
+	mov	8*3($a_ptr), $acc7
+	lea	.Lord(%rip), $a_ptr	# pointer to modulus
+	mov	$b_org, $b_ptr
+	jmp	.Loop_ord_sqr
+
+.align	32
+.Loop_ord_sqr:
+	################################# a[1:] * a[0]
+	mov	%rax, $t1		# put aside a[1]
+	mul	$acc0			# a[1] * a[0]
+	mov	%rax, $acc1
+	movq	$t1, %xmm1		# offload a[1]
+	mov	$acc6, %rax
+	mov	%rdx, $acc2
+
+	mul	$acc0			# a[2] * a[0]
+	add	%rax, $acc2
+	mov	$acc7, %rax
+	movq	$acc6, %xmm2		# offload a[2]
+	adc	\$0, %rdx
+	mov	%rdx, $acc3
+
+	mul	$acc0			# a[3] * a[0]
+	add	%rax, $acc3
+	mov	$acc7, %rax
+	movq	$acc7, %xmm3		# offload a[3]
+	adc	\$0, %rdx
+	mov	%rdx, $acc4
+
+	################################# a[3] * a[2]
+	mul	$acc6			# a[3] * a[2]
+	mov	%rax, $acc5
+	mov	$acc6, %rax
+	mov	%rdx, $acc6
+
+	################################# a[2:] * a[1]
+	mul	$t1			# a[2] * a[1]
+	add	%rax, $acc3
+	mov	$acc7, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $acc7
+
+	mul	$t1			# a[3] * a[1]
+	add	%rax, $acc4
+	adc	\$0, %rdx
+
+	add	$acc7, $acc4
+	adc	%rdx, $acc5
+	adc	\$0, $acc6		# can't overflow
+
+	################################# *2
+	xor	$acc7, $acc7
+	mov	$acc0, %rax
+	add	$acc1, $acc1
+	adc	$acc2, $acc2
+	adc	$acc3, $acc3
+	adc	$acc4, $acc4
+	adc	$acc5, $acc5
+	adc	$acc6, $acc6
+	adc	\$0, $acc7
+
+	################################# Missing products
+	mul	%rax			# a[0] * a[0]
+	mov	%rax, $acc0
+	movq	%xmm1, %rax
+	mov	%rdx, $t1
+
+	mul	%rax			# a[1] * a[1]
+	add	$t1, $acc1
+	adc	%rax, $acc2
+	movq	%xmm2, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	mul	%rax			# a[2] * a[2]
+	add	$t1, $acc3
+	adc	%rax, $acc4
+	movq	%xmm3, %rax
+	adc	\$0, %rdx
+	mov	%rdx, $t1
+
+	 mov	$acc0, $t0
+	 imulq	8*4($a_ptr), $acc0	# *= .LordK
+
+	mul	%rax			# a[3] * a[3]
+	add	$t1, $acc5
+	adc	%rax, $acc6
+	 mov	8*0($a_ptr), %rax	# modulus[0]
+	adc	%rdx, $acc7		# can't overflow
+
+	################################# First reduction step
+	mul	$acc0
+	mov	$acc0, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax	# modulus[1]
+	adc	%rdx, $t0
+
+	sub	$acc0, $acc2
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc0
+	add	$t0, $acc1
+	adc	\$0, %rdx
+	add	%rax, $acc1
+	mov	$acc0, %rax
+	adc	%rdx, $acc2
+	mov	$acc0, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	 mov	$acc1, $t0
+	 imulq	8*4($a_ptr), $acc1	# *= .LordK
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc3
+	 mov	8*0($a_ptr), %rax
+	sbb	%rdx, $acc0		# can't borrow
+
+	add	$t1, $acc3
+	adc	\$0, $acc0		# can't overflow
+
+	################################# Second reduction step
+	mul	$acc1
+	mov	$acc1, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax
+	adc	%rdx, $t0
+
+	sub	$acc1, $acc3
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc1
+	add	$t0, $acc2
+	adc	\$0, %rdx
+	add	%rax, $acc2
+	mov	$acc1, %rax
+	adc	%rdx, $acc3
+	mov	$acc1, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	 mov	$acc2, $t0
+	 imulq	8*4($a_ptr), $acc2	# *= .LordK
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc0
+	 mov	8*0($a_ptr), %rax
+	sbb	%rdx, $acc1		# can't borrow
+
+	add	$t1, $acc0
+	adc	\$0, $acc1		# can't overflow
+
+	################################# Third reduction step
+	mul	$acc2
+	mov	$acc2, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax
+	adc	%rdx, $t0
+
+	sub	$acc2, $acc0
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc2
+	add	$t0, $acc3
+	adc	\$0, %rdx
+	add	%rax, $acc3
+	mov	$acc2, %rax
+	adc	%rdx, $acc0
+	mov	$acc2, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	 mov	$acc3, $t0
+	 imulq	8*4($a_ptr), $acc3	# *= .LordK
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc1
+	 mov	8*0($a_ptr), %rax
+	sbb	%rdx, $acc2		# can't borrow
+
+	add	$t1, $acc1
+	adc	\$0, $acc2		# can't overflow
+
+	################################# Last reduction step
+	mul	$acc3
+	mov	$acc3, $t1
+	add	%rax, $t0		# guaranteed to be zero
+	mov	8*1($a_ptr), %rax
+	adc	%rdx, $t0
+
+	sub	$acc3, $acc1
+	sbb	\$0, $t1		# can't borrow
+
+	mul	$acc3
+	add	$t0, $acc0
+	adc	\$0, %rdx
+	add	%rax, $acc0
+	mov	$acc3, %rax
+	adc	%rdx, $acc1
+	mov	$acc3, %rdx
+	adc	\$0, $t1		# can't overflow
+
+	shl	\$32, %rax
+	shr	\$32, %rdx
+	sub	%rax, $acc2
+	sbb	%rdx, $acc3		# can't borrow
+
+	add	$t1, $acc2
+	adc	\$0, $acc3		# can't overflow
+
+	################################# Add bits [511:256] of the sqr result
+	xor	%rdx, %rdx
+	add	$acc4, $acc0
+	adc	$acc5, $acc1
+	 mov	$acc0, $acc4
+	adc	$acc6, $acc2
+	adc	$acc7, $acc3
+	 mov	$acc1, %rax
+	adc	\$0, %rdx
+
+	################################# Compare to modulus
+	sub	8*0($a_ptr), $acc0
+	 mov	$acc2, $acc6
+	sbb	8*1($a_ptr), $acc1
+	sbb	8*2($a_ptr), $acc2
+	 mov	$acc3, $acc7
+	sbb	8*3($a_ptr), $acc3
+	sbb	\$0, %rdx
+
+	cmovc	$acc4, $acc0
+	cmovnc	$acc1, %rax
+	cmovnc	$acc2, $acc6
+	cmovnc	$acc3, $acc7
+
+	dec	$b_ptr
+	jnz	.Loop_ord_sqr
+
+	mov	$acc0, 8*0($r_ptr)
+	mov	%rax,  8*1($r_ptr)
+	pxor	%xmm1, %xmm1
+	mov	$acc6, 8*2($r_ptr)
+	pxor	%xmm2, %xmm2
+	mov	$acc7, 8*3($r_ptr)
+	pxor	%xmm3, %xmm3
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqr_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+
+$code.=<<___	if ($addx);
+################################################################################
+.type	ecp_nistz256_ord_mul_montx,\@function,3
+.align	32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_mulx_body:
+
+	mov	$b_org, $b_ptr
+	mov	8*0($b_org), %rdx
+	mov	8*0($a_ptr), $acc1
+	mov	8*1($a_ptr), $acc2
+	mov	8*2($a_ptr), $acc3
+	mov	8*3($a_ptr), $acc4
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+	lea	.Lord-128(%rip), %r14
+	mov	.LordK(%rip), %r15
+
+	################################# Multiply by b[0]
+	mulx	$acc1, $acc0, $acc1
+	mulx	$acc2, $t0, $acc2
+	mulx	$acc3, $t1, $acc3
+	add	$t0, $acc1
+	mulx	$acc4, $t0, $acc4
+	 mov	$acc0, %rdx
+	 mulx	%r15, %rdx, %rax
+	adc	$t1, $acc2
+	adc	$t0, $acc3
+	adc	\$0, $acc4
+
+	################################# reduction
+	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc0		# guaranteed to be zero
+	adox	$t1, $acc1
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*3+128(%r14), $t0, $t1
+	 mov	8*1($b_ptr), %rdx
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+	adcx	$acc0, $acc4
+	adox	$acc0, $acc5
+	adc	\$0, $acc5		# cf=0, of=0
+
+	################################# Multiply by b[1]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc1, %rdx
+	 mulx	%r15, %rdx, %rax
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	adcx	$acc0, $acc5
+	adox	$acc0, $acc0
+	adc	\$0, $acc0		# cf=0, of=0
+
+	################################# reduction
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc1		# guaranteed to be zero
+	adox	$t1, $acc2
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*3+128(%r14), $t0, $t1
+	 mov	8*2($b_ptr), %rdx
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	adcx	$acc1, $acc5
+	adox	$acc1, $acc0
+	adc	\$0, $acc0		# cf=0, of=0
+
+	################################# Multiply by b[2]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc2, %rdx
+	 mulx	%r15, %rdx, %rax
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	adcx	$acc1, $acc0
+	adox	$acc1, $acc1
+	adc	\$0, $acc1		# cf=0, of=0
+
+	################################# reduction
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc2		# guaranteed to be zero
+	adox	$t1, $acc3
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*3+128(%r14), $t0, $t1
+	 mov	8*3($b_ptr), %rdx
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+	adcx	$acc2, $acc0
+	adox	$acc2, $acc1
+	adc	\$0, $acc1		# cf=0, of=0
+
+	################################# Multiply by b[3]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc3, %rdx
+	 mulx	%r15, %rdx, %rax
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+
+	adcx	$acc2, $acc1
+	adox	$acc2, $acc2
+	adc	\$0, $acc2		# cf=0, of=0
+
+	################################# reduction
+	mulx	8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc3		# guranteed to be zero
+	adox	$t1, $acc4
+
+	mulx	8*1+128(%r14), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*2+128(%r14), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	mulx	8*3+128(%r14), $t0, $t1
+	lea	128(%r14),%r14
+	 mov	$acc4, $t2
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	 mov	$acc5, $t3
+	adcx	$acc3, $acc1
+	adox	$acc3, $acc2
+	adc	\$0, $acc2
+
+	#################################
+	# Branch-less conditional subtraction of P
+	 mov	$acc0, $t0
+	sub	8*0(%r14), $acc4
+	sbb	8*1(%r14), $acc5
+	sbb	8*2(%r14), $acc0
+	 mov	$acc1, $t1
+	sbb	8*3(%r14), $acc1
+	sbb	\$0, $acc2
+
+	cmovc	$t2, $acc4
+	cmovc	$t3, $acc5
+	cmovc	$t0, $acc0
+	cmovc	$t1, $acc1
+
+	mov	$acc4, 8*0($r_ptr)
+	mov	$acc5, 8*1($r_ptr)
+	mov	$acc0, 8*2($r_ptr)
+	mov	$acc1, 8*3($r_ptr)
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mulx_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type	ecp_nistz256_ord_sqr_montx,\@function,3
+.align	32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+	push	%rbp
+.cfi_push	%rbp
+	push	%rbx
+.cfi_push	%rbx
+	push	%r12
+.cfi_push	%r12
+	push	%r13
+.cfi_push	%r13
+	push	%r14
+.cfi_push	%r14
+	push	%r15
+.cfi_push	%r15
+.Lord_sqrx_body:
+
+	mov	$b_org, $b_ptr
+	mov	8*0($a_ptr), %rdx
+	mov	8*1($a_ptr), $acc6
+	mov	8*2($a_ptr), $acc7
+	mov	8*3($a_ptr), $acc0
+	lea	.Lord(%rip), $a_ptr
+	jmp	.Loop_ord_sqrx
+
+.align	32
+.Loop_ord_sqrx:
+	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
+	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
+	 mov	%rdx, %rax		# offload a[0]
+	 movq	$acc6, %xmm1		# offload a[1]
+	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
+	 mov	$acc6, %rdx
+	add	$t0, $acc2
+	 movq	$acc7, %xmm2		# offload a[2]
+	adc	$t1, $acc3
+	adc	\$0, $acc4
+	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
+	#################################
+	mulx	$acc7, $t0, $t1		# a[1]*a[2]
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	$acc0, $t0, $t1		# a[1]*a[3]
+	 mov	$acc7, %rdx
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	adc	\$0, $acc5
+	#################################
+	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
+	mov	%rax, %rdx
+	 movq	$acc0, %xmm3		# offload a[3]
+	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
+	 adcx	$acc1, $acc1		# acc1:6<<1
+	adox	$t0, $acc5
+	 adcx	$acc2, $acc2
+	adox	$acc7, $acc6		# of=0
+
+	################################# a[i]*a[i]
+	mulx	%rdx, $acc0, $t1
+	movq	%xmm1, %rdx
+	 adcx	$acc3, $acc3
+	adox	$t1, $acc1
+	 adcx	$acc4, $acc4
+	mulx	%rdx, $t0, $t4
+	movq	%xmm2, %rdx
+	 adcx	$acc5, $acc5
+	adox	$t0, $acc2
+	 adcx	$acc6, $acc6
+	mulx	%rdx, $t0, $t1
+	.byte	0x67
+	movq	%xmm3, %rdx
+	adox	$t4, $acc3
+	 adcx	$acc7, $acc7
+	adox	$t0, $acc4
+	adox	$t1, $acc5
+	mulx	%rdx, $t0, $t4
+	adox	$t0, $acc6
+	adox	$t4, $acc7
+
+	################################# reduction
+	mov	$acc0, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	xor	%rax, %rax		# cf=0, of=0
+	mulx	8*0($a_ptr), $t0, $t1
+	adcx	$t0, $acc0		# guaranteed to be zero
+	adox	$t1, $acc1
+	mulx	8*1($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*2($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*3($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0		# of=0
+	adcx	%rax, $acc0		# cf=0
+
+	#################################
+	mov	$acc1, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	mulx	8*0($a_ptr), $t0, $t1
+	adox	$t0, $acc1		# guaranteed to be zero
+	adcx	$t1, $acc2
+	mulx	8*1($a_ptr), $t0, $t1
+	adox	$t0, $acc2
+	adcx	$t1, $acc3
+	mulx	8*2($a_ptr), $t0, $t1
+	adox	$t0, $acc3
+	adcx	$t1, $acc0
+	mulx	8*3($a_ptr), $t0, $t1
+	adox	$t0, $acc0
+	adcx	$t1, $acc1		# cf=0
+	adox	%rax, $acc1		# of=0
+
+	#################################
+	mov	$acc2, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	mulx	8*0($a_ptr), $t0, $t1
+	adcx	$t0, $acc2		# guaranteed to be zero
+	adox	$t1, $acc3
+	mulx	8*1($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0
+	mulx	8*2($a_ptr), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	mulx	8*3($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2		# of=0
+	adcx	%rax, $acc2		# cf=0
+
+	#################################
+	mov	$acc3, %rdx
+	mulx	8*4($a_ptr), %rdx, $t0
+
+	mulx	8*0($a_ptr), $t0, $t1
+	adox	$t0, $acc3		# guaranteed to be zero
+	adcx	$t1, $acc0
+	mulx	8*1($a_ptr), $t0, $t1
+	adox	$t0, $acc0
+	adcx	$t1, $acc1
+	mulx	8*2($a_ptr), $t0, $t1
+	adox	$t0, $acc1
+	adcx	$t1, $acc2
+	mulx	8*3($a_ptr), $t0, $t1
+	adox	$t0, $acc2
+	adcx	$t1, $acc3
+	adox	%rax, $acc3
+
+	################################# accumulate upper half
+	add	$acc0, $acc4		# add	$acc4, $acc0
+	adc	$acc5, $acc1
+	 mov	$acc4, %rdx
+	adc	$acc6, $acc2
+	adc	$acc7, $acc3
+	 mov	$acc1, $acc6
+	adc	\$0, %rax
+
+	################################# compare to modulus
+	sub	8*0($a_ptr), $acc4
+	 mov	$acc2, $acc7
+	sbb	8*1($a_ptr), $acc1
+	sbb	8*2($a_ptr), $acc2
+	 mov	$acc3, $acc0
+	sbb	8*3($a_ptr), $acc3
+	sbb	\$0, %rax
+
+	cmovnc	$acc4, %rdx
+	cmovnc	$acc1, $acc6
+	cmovnc	$acc2, $acc7
+	cmovnc	$acc3, $acc0
+
+	dec	$b_ptr
+	jnz	.Loop_ord_sqrx
+
+	mov	%rdx, 8*0($r_ptr)
+	mov	$acc6, 8*1($r_ptr)
+	pxor	%xmm1, %xmm1
+	mov	$acc7, 8*2($r_ptr)
+	pxor	%xmm2, %xmm2
+	mov	$acc0, 8*3($r_ptr)
+	pxor	%xmm3, %xmm3
+
+	mov	0(%rsp),%r15
+.cfi_restore	%r15
+	mov	8(%rsp),%r14
+.cfi_restore	%r14
+	mov	16(%rsp),%r13
+.cfi_restore	%r13
+	mov	24(%rsp),%r12
+.cfi_restore	%r12
+	mov	32(%rsp),%rbx
+.cfi_restore	%rbx
+	mov	40(%rsp),%rbp
+.cfi_restore	%rbp
+	lea	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqrx_epilogue:
+	ret
+.cfi_endproc
+.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+
+$code.=<<___;
+################################################################################
 # void ecp_nistz256_mul_mont(
 #   uint64_t res[4],
 #   uint64_t a[4],
@@ -2840,6 +3931,24 @@
 	.rva	.LSEH_end_ecp_nistz256_neg
 	.rva	.LSEH_info_ecp_nistz256_neg
 
+	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
+	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
+	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
+	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
+	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___	if ($addx);
+	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
+	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
+	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx
+
+	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
+	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
+	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
 	.rva	.LSEH_begin_ecp_nistz256_mul_mont
 	.rva	.LSEH_end_ecp_nistz256_mul_mont
 	.rva	.LSEH_info_ecp_nistz256_mul_mont
@@ -2899,6 +4008,30 @@
 	.byte	9,0,0,0
 	.rva	short_handler
 	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_ord_mul_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_ord_sqr_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
+	.long	48,0
+___
+$code.=<<___ if ($addx);
+.LSEH_info_ecp_nistz256_ord_mul_montx:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_ord_sqr_montx:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
+	.long	48,0
+___
+$code.=<<___;
 .LSEH_info_ecp_nistz256_mul_mont:
 	.byte	9,0,0,0
 	.rva	full_handler
diff --git a/src/crypto/fipsmodule/ec/ec.c b/src/crypto/fipsmodule/ec/ec.c
index ee7ec55..07f9c34 100644
--- a/src/crypto/fipsmodule/ec/ec.c
+++ b/src/crypto/fipsmodule/ec/ec.c
@@ -352,7 +352,7 @@
   }
 
   if (BN_num_bytes(order) > EC_MAX_SCALAR_BYTES) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FIELD);
+    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER);
     return 0;
   }
 
@@ -839,7 +839,7 @@
   BIGNUM *tmp = BN_CTX_get(ctx);
   int ok = tmp != NULL &&
            BN_nnmod(tmp, in, order, ctx) &&
-           ec_bignum_to_scalar_unchecked(group, out, tmp);
+           ec_bignum_to_scalar(group, out, tmp);
   BN_CTX_end(ctx);
   return ok;
 }
@@ -955,30 +955,3 @@
 
   return OPENSSL_NUM_BUILT_IN_CURVES;
 }
-
-int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
-                        const BIGNUM *in) {
-  if (!ec_bignum_to_scalar_unchecked(group, out, in)) {
-    return 0;
-  }
-  if (!bn_less_than_words(out->words, group->order.d, group->order.width)) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
-    return 0;
-  }
-  return 1;
-}
-
-int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
-                                  const BIGNUM *in) {
-  if (!bn_copy_words(out->words, group->order.width, in)) {
-    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
-    return 0;
-  }
-  return 1;
-}
-
-int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
-                             const uint8_t additional_data[32]) {
-  return bn_rand_range_words(out->words, 1, group->order.d, group->order.width,
-                             additional_data);
-}
diff --git a/src/crypto/fipsmodule/ec/ec_montgomery.c b/src/crypto/fipsmodule/ec/ec_montgomery.c
index 165c06f..d80fa23 100644
--- a/src/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/src/crypto/fipsmodule/ec/ec_montgomery.c
@@ -184,68 +184,58 @@
 
   BN_CTX_start(ctx);
 
-  if (BN_cmp(&point->Z, &group->one) == 0) {
-    // |point| is already affine.
-    if (x != NULL && !BN_from_montgomery(x, &point->X, group->mont, ctx)) {
+  // transform  (X, Y, Z)  into  (x, y) := (X/Z^2, Y/Z^3)
+
+  BIGNUM *Z_1 = BN_CTX_get(ctx);
+  BIGNUM *Z_2 = BN_CTX_get(ctx);
+  BIGNUM *Z_3 = BN_CTX_get(ctx);
+  if (Z_1 == NULL ||
+      Z_2 == NULL ||
+      Z_3 == NULL) {
+    goto err;
+  }
+
+  // The straightforward way to calculate the inverse of a Montgomery-encoded
+  // value where the result is Montgomery-encoded is:
+  //
+  //    |BN_from_montgomery| + invert + |BN_to_montgomery|.
+  //
+  // This is equivalent, but more efficient, because |BN_from_montgomery|
+  // is more efficient (at least in theory) than |BN_to_montgomery|, since it
+  // doesn't have to do the multiplication before the reduction.
+  //
+  // Use Fermat's Little Theorem instead of |BN_mod_inverse_odd| since this
+  // inversion may be done as the final step of private key operations.
+  // Unfortunately, this is suboptimal for ECDSA verification.
+  if (!BN_from_montgomery(Z_1, &point->Z, group->mont, ctx) ||
+      !BN_from_montgomery(Z_1, Z_1, group->mont, ctx) ||
+      !bn_mod_inverse_prime(Z_1, Z_1, &group->field, ctx, group->mont)) {
+    goto err;
+  }
+
+  if (!BN_mod_mul_montgomery(Z_2, Z_1, Z_1, group->mont, ctx)) {
+    goto err;
+  }
+
+  // Instead of using |BN_from_montgomery| to convert the |x| coordinate
+  // and then calling |BN_from_montgomery| again to convert the |y|
+  // coordinate below, convert the common factor |Z_2| once now, saving one
+  // reduction.
+  if (!BN_from_montgomery(Z_2, Z_2, group->mont, ctx)) {
+    goto err;
+  }
+
+  if (x != NULL) {
+    if (!BN_mod_mul_montgomery(x, &point->X, Z_2, group->mont, ctx)) {
       goto err;
     }
-    if (y != NULL && !BN_from_montgomery(y, &point->Y, group->mont, ctx)) {
+  }
+
+  if (y != NULL) {
+    if (!BN_mod_mul_montgomery(Z_3, Z_2, Z_1, group->mont, ctx) ||
+        !BN_mod_mul_montgomery(y, &point->Y, Z_3, group->mont, ctx)) {
       goto err;
     }
-  } else {
-    // transform  (X, Y, Z)  into  (x, y) := (X/Z^2, Y/Z^3)
-
-    BIGNUM *Z_1 = BN_CTX_get(ctx);
-    BIGNUM *Z_2 = BN_CTX_get(ctx);
-    BIGNUM *Z_3 = BN_CTX_get(ctx);
-    if (Z_1 == NULL ||
-        Z_2 == NULL ||
-        Z_3 == NULL) {
-      goto err;
-    }
-
-    // The straightforward way to calculate the inverse of a Montgomery-encoded
-    // value where the result is Montgomery-encoded is:
-    //
-    //    |BN_from_montgomery| + invert + |BN_to_montgomery|.
-    //
-    // This is equivalent, but more efficient, because |BN_from_montgomery|
-    // is more efficient (at least in theory) than |BN_to_montgomery|, since it
-    // doesn't have to do the multiplication before the reduction.
-    //
-    // Use Fermat's Little Theorem instead of |BN_mod_inverse_odd| since this
-    // inversion may be done as the final step of private key operations.
-    // Unfortunately, this is suboptimal for ECDSA verification.
-    if (!BN_from_montgomery(Z_1, &point->Z, group->mont, ctx) ||
-        !BN_from_montgomery(Z_1, Z_1, group->mont, ctx) ||
-        !bn_mod_inverse_prime(Z_1, Z_1, &group->field, ctx, group->mont)) {
-      goto err;
-    }
-
-    if (!BN_mod_mul_montgomery(Z_2, Z_1, Z_1, group->mont, ctx)) {
-      goto err;
-    }
-
-    // Instead of using |BN_from_montgomery| to convert the |x| coordinate
-    // and then calling |BN_from_montgomery| again to convert the |y|
-    // coordinate below, convert the common factor |Z_2| once now, saving one
-    // reduction.
-    if (!BN_from_montgomery(Z_2, Z_2, group->mont, ctx)) {
-      goto err;
-    }
-
-    if (x != NULL) {
-      if (!BN_mod_mul_montgomery(x, &point->X, Z_2, group->mont, ctx)) {
-        goto err;
-      }
-    }
-
-    if (y != NULL) {
-      if (!BN_mod_mul_montgomery(Z_3, Z_2, Z_1, group->mont, ctx) ||
-          !BN_mod_mul_montgomery(y, &point->Y, Z_3, group->mont, ctx)) {
-        goto err;
-      }
-    }
   }
 
   ret = 1;
@@ -267,4 +257,5 @@
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
   out->field_decode = ec_GFp_mont_field_decode;
+  out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
 }
diff --git a/src/crypto/fipsmodule/ec/internal.h b/src/crypto/fipsmodule/ec/internal.h
index c5d7291..7f72c31 100644
--- a/src/crypto/fipsmodule/ec/internal.h
+++ b/src/crypto/fipsmodule/ec/internal.h
@@ -133,6 +133,12 @@
                       BN_CTX *);  // e.g. to Montgomery
   int (*field_decode)(const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                       BN_CTX *);  // e.g. from Montgomery
+
+  // scalar_inv_mont sets |out| to |in|^-1, where both input and output are in
+  // Montgomery form.
+  void (*scalar_inv_montgomery)(const EC_GROUP *group, EC_SCALAR *out,
+                                const EC_SCALAR *in);
+
 } /* EC_METHOD */;
 
 const EC_METHOD *EC_GFp_mont_method(void);
@@ -183,16 +189,33 @@
 OPENSSL_EXPORT int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
                                        const BIGNUM *in);
 
-// ec_bignum_to_scalar_unchecked behaves like |ec_bignum_to_scalar| but does not
-// check |in| is fully reduced.
-int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
-                                  const BIGNUM *in);
-
 // ec_random_nonzero_scalar sets |out| to a uniformly selected random value from
 // 1 to |group->order| - 1. It returns one on success and zero on error.
 int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
                              const uint8_t additional_data[32]);
 
+// ec_scalar_add sets |r| to |a| + |b|.
+void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
+                   const EC_SCALAR *b);
+
+// ec_scalar_to_montgomery sets |r| to |a| in Montgomery form.
+void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                             const EC_SCALAR *a);
+
+// ec_scalar_to_montgomery sets |r| to |a| converted from Montgomery form.
+void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                               const EC_SCALAR *a);
+
+// ec_scalar_mul_montgomery sets |r| to |a| * |b| where inputs and outputs are
+// in Montgomery form.
+void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a, const EC_SCALAR *b);
+
+// ec_scalar_mul_montgomery sets |r| to |a|^-1 where inputs and outputs are in
+// Montgomery form.
+void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a);
+
 // ec_point_add_mixed behaves like |EC_POINT_add|, but |&b->Z| must be zero or
 // one.
 int ec_point_add_mixed(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a,
@@ -254,6 +277,8 @@
 int ec_GFp_simple_make_affine(const EC_GROUP *, EC_POINT *, BN_CTX *);
 int ec_GFp_simple_points_make_affine(const EC_GROUP *, size_t num,
                                      EC_POINT * [], BN_CTX *);
+void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a);
 
 // method functions in montgomery.c
 int ec_GFp_mont_group_init(EC_GROUP *);
diff --git a/src/crypto/fipsmodule/ec/p224-64.c b/src/crypto/fipsmodule/ec/p224-64.c
index 7e2f45b..0a379fe 100644
--- a/src/crypto/fipsmodule/ec/p224-64.c
+++ b/src/crypto/fipsmodule/ec/p224-64.c
@@ -203,38 +203,25 @@
   }
 }
 
-// To preserve endianness when using BN_bn2bin and BN_bin2bn
-static void p224_flip_endian(uint8_t *out, const uint8_t *in, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    out[i] = in[len - 1 - i];
-  }
-}
-
 // From OpenSSL BIGNUM to internal representation
 static int p224_BN_to_felem(p224_felem out, const BIGNUM *bn) {
   // BN_bn2bin eats leading zeroes
   p224_felem_bytearray b_out;
-  OPENSSL_memset(b_out, 0, sizeof(b_out));
-  size_t num_bytes = BN_num_bytes(bn);
-  if (num_bytes > sizeof(b_out) ||
-      BN_is_negative(bn)) {
+  if (BN_is_negative(bn) ||
+      !BN_bn2le_padded(b_out, sizeof(b_out), bn)) {
     OPENSSL_PUT_ERROR(EC, EC_R_BIGNUM_OUT_OF_RANGE);
     return 0;
   }
 
-  p224_felem_bytearray b_in;
-  num_bytes = BN_bn2bin(bn, b_in);
-  p224_flip_endian(b_out, b_in, num_bytes);
   p224_bin28_to_felem(out, b_out);
   return 1;
 }
 
 // From internal representation to OpenSSL BIGNUM
 static BIGNUM *p224_felem_to_BN(BIGNUM *out, const p224_felem in) {
-  p224_felem_bytearray b_in, b_out;
-  p224_felem_to_bin28(b_in, in);
-  p224_flip_endian(b_out, b_in, sizeof(b_out));
-  return BN_bin2bn(b_out, sizeof(b_out), out);
+  p224_felem_bytearray b_out;
+  p224_felem_to_bin28(b_out, in);
+  return BN_le2bn(b_out, sizeof(b_out), out);
 }
 
 // Field operations, using the internal representation of field elements.
@@ -1127,6 +1114,7 @@
   out->field_sqr = ec_GFp_nistp224_field_sqr;
   out->field_encode = NULL;
   out->field_decode = NULL;
+  out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
 };
 
 #endif  // BORINGSSL_HAS_UINT128 && !SMALL
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index dbe99ed..d8d3a39 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -430,6 +430,87 @@
   return 1;
 }
 
+static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
+                                     const EC_SCALAR *in) {
+  // table[i] stores a power of |in| corresponding to the matching enum value.
+  enum {
+    // The following indices specify the power in binary.
+    i_1 = 0,
+    i_10,
+    i_11,
+    i_101,
+    i_111,
+    i_1010,
+    i_1111,
+    i_10101,
+    i_101010,
+    i_101111,
+    // The following indices specify 2^N-1, or N ones in a row.
+    i_x6,
+    i_x8,
+    i_x16,
+    i_x32
+  };
+  BN_ULONG table[15][P256_LIMBS];
+
+  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+  //
+  // Even though this code path spares 12 squarings, 4.5%, and 13
+  // multiplications, 25%, the overall sign operation is not that much faster,
+  // not more that 2%. Most of the performance of this function comes from the
+  // scalar operations.
+
+  // Pre-calculate powers.
+  OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG));
+
+  ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
+
+  ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
+
+  ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1);
+  ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1);
+
+  ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
+
+  ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2);
+  ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8);
+  ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
+
+  ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16);
+  ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
+
+  // Compute |in| raised to the order-2.
+  ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64);
+  ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]);
+  static const struct {
+    uint8_t p, i;
+  } kChain[27] = {{32, i_x32},    {6, i_101111}, {5, i_111},    {4, i_11},
+                  {5, i_1111},    {5, i_10101},  {4, i_101},    {3, i_101},
+                  {3, i_101},     {5, i_111},    {9, i_101111}, {6, i_1111},
+                  {2, i_1},       {5, i_1},      {6, i_1111},   {5, i_111},
+                  {4, i_111},     {5, i_111},    {5, i_101},    {3, i_11},
+                  {10, i_101111}, {2, i_11},     {5, i_11},     {5, i_11},
+                  {3, i_1},       {7, i_10101},  {6, i_1111}};
+  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kChain); i++) {
+    ecp_nistz256_ord_sqr_mont(out->words, out->words, kChain[i].p);
+    ecp_nistz256_ord_mul_mont(out->words, out->words, table[kChain[i].i]);
+  }
+}
+
 DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
   out->group_init = ec_GFp_mont_group_init;
   out->group_finish = ec_GFp_mont_group_finish;
@@ -441,6 +522,7 @@
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
   out->field_decode = ec_GFp_mont_field_decode;
+  out->scalar_inv_montgomery = ecp_nistz256_inv_mod_ord;
 };
 
 #endif /* !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.h b/src/crypto/fipsmodule/ec/p256-x86_64.h
index 9226124..21b461c 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.h
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.h
@@ -62,6 +62,24 @@
 }
 
 
+// P-256 scalar operations.
+//
+// The following functions compute modulo N, where N is the order of P-256. They
+// take fully-reduced inputs and give fully-reduced outputs.
+
+// ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs
+// are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N.
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+                               const BN_ULONG a[P256_LIMBS],
+                               const BN_ULONG b[P256_LIMBS]);
+
+// ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and
+// outputs are in Montgomery form. That is, |res| is
+// (|a| * 2^-256)^(2*|rep|) * 2^256 mod N.
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+                               const BN_ULONG a[P256_LIMBS], int rep);
+
+
 // P-256 point operations.
 //
 // The following functions may be used in-place. All coordinates are in the
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
index 5cd701b..8ed1dd4 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -365,6 +365,47 @@
   }
 }
 
+static void TestOrdMulMont(FileTest *t) {
+  // This test works on scalars rather than field elements, but the
+  // representation is the same.
+  BN_ULONG a[P256_LIMBS], b[P256_LIMBS], result[P256_LIMBS];
+  ASSERT_TRUE(GetFieldElement(t, a, "A"));
+  ASSERT_TRUE(GetFieldElement(t, b, "B"));
+  ASSERT_TRUE(GetFieldElement(t, result, "Result"));
+
+  BN_ULONG ret[P256_LIMBS];
+  ecp_nistz256_ord_mul_mont(ret, a, b);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  ecp_nistz256_ord_mul_mont(ret, b, a);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, a, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, ret /* a */, b);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, a, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, b, ret);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, b, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, a, ret /* b */);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  OPENSSL_memcpy(ret, b, sizeof(ret));
+  ecp_nistz256_ord_mul_mont(ret, ret /* b */, a);
+  EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+  if (OPENSSL_memcmp(a, b, sizeof(a)) == 0) {
+    ecp_nistz256_ord_sqr_mont(ret, a, 1);
+    EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+
+    OPENSSL_memcpy(ret, a, sizeof(ret));
+    ecp_nistz256_ord_sqr_mont(ret, ret /* a */, 1);
+    EXPECT_FIELD_ELEMENTS_EQUAL(result, ret);
+  }
+}
+
 TEST(P256_X86_64Test, TestVectors) {
   return FileTestGTest("crypto/fipsmodule/ec/p256-x86_64_tests.txt",
                        [](FileTest *t) {
@@ -376,6 +417,8 @@
       TestFromMont(t);
     } else if (t->GetParameter() == "PointAdd") {
       TestPointAdd(t);
+    } else if (t->GetParameter() == "OrdMulMont") {
+      TestOrdMulMont(t);
     } else {
       FAIL() << "Unknown test type:" << t->GetParameter();
     }
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt b/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt
index a680850..d1fdad0 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_tests.txt
@@ -1403,3 +1403,141 @@
 B.Z = 1f7c7226d78e51478c683bbb6afe01abc2225dbfc773d0806d30ff5f827b76c8
 Result.X = fba400ae656ec3103c5c5f531d2a0f7368031e01a48a91f1a4f3138d294b13be
 Result.Y = 160e358ad1f059eb62722df01a7440048a1db21ecaea8698efa9677db6e9ff97
+
+
+# Scalar montgomery multiplication tests.
+#
+# The following tests satisfy A * B * 2^-256 = Result (mod N).
+
+Test = OrdMulMont
+A = 0000000000000000000000000000000000000000000000000000000000000000
+B = b4e9b0aea84aa5ed86964a22881a4d0e58f88e9225f30990c18751e7d4b9ec95
+Result = 0000000000000000000000000000000000000000000000000000000000000000
+
+Test = OrdMulMont
+A = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+B = 5d24e62244973fbd829573d5a579b4e89a6512933a2c3d255bbdbc1c89028323
+Result = 5d24e62244973fbd829573d5a579b4e89a6512933a2c3d255bbdbc1c89028323
+
+Test = OrdMulMont
+A = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632550
+B = abafdc695e4c2c850f8fc60f1efdbf7406a3cd2c6c59bb7e608985723896c187
+Result = 917b1214c7b31a7ee7e53be0b41a139e435ff576b51ec6af1e1a944412bea38b
+
+Test = OrdMulMont
+A = cf0f01b83670a1c79154ea16f3574ca2d4c688a3c3b6017795cbe54854418904
+B = c5ec4d3b00fb2e11fb3b1aa09e60f7d187f7c515977d1343dab9745961fcbb43
+Result = 7aaddcee32e3b340af5ad06f854284cbbce5a1ab919e9b7771c3b0e937093438
+
+Test = OrdMulMont
+A = 50023f9913879ac4020bc45a89a0ea89082db6265b96b851af29969dd8a9661c
+B = 7c165b1cba80808db114441563aa0fbfba41b9e8acff77312a2dd2138b74ef89
+Result = 3d2ca1705d8d38cbc76a5409c6535044733cafcb95d12654af1d14de177978b5
+
+Test = OrdMulMont
+A = 4d5341ea735e53d2e4f2934755642adee209bd0e5a1506206513227f3c48b270
+B = 6e48f2b60eb8fb86760134abaf3d61692557862924069c599ceb31309ea18704
+Result = 37cde3e35c814d4287bd345b910d687983929907b7a08afa2acd8596832ea86c
+
+Test = OrdMulMont
+A = 33d06c3f5a595a41a6f9c4356f8ab2b8c550d4c64b806eab5560af247c5fa9ed
+B = 0e52f34adf5754343bcf3529d652620da3c05b5dd9cdcddfb08b674a1ad21a09
+Result = 9dc64d7b4c1bc33b930e0daee2a24fc41f770378659ee71b846d2239b0fea8ea
+
+Test = OrdMulMont
+A = 8f211780cce4f93b7193b9378e6f83e1147fb3602b052eef782de8cc833e54ab
+B = e1e4f7f1feb15be64292cff86b47cd9730bcb15b133340022b824d591a660cdf
+Result = dfa2b683b1ae23027c7c109e0abb40a1366eda027ad2cad1a09061a57bee391f
+
+Test = OrdMulMont
+A = 803c279c7e4c11a5568290c0a5789ceab6860f51a942bf646501a45e1ec0a6bf
+B = c0a1145a12037129c571f5f939bf16ea0b8b480f08ec774c045d059841f7d5ed
+Result = ab48fa3b4aa692a7c077cc55ee3c3fff895118a23728c2fa5f361b30730d955a
+
+Test = OrdMulMont
+A = 0e5c95158297d75dbf0b02c3090730f65bf14704495b14837dd907af569407f1
+B = 5a03e3787c8772b2fb7ab07d7fe7fe653a58bdae7fde3174c6ed305e524f5728
+Result = 71296d305dcf9ce39010ea4f4bbf9f7c1064a413597bdc7574c13dea3fa514dc
+
+Test = OrdMulMont
+A = 366299be07886f7846fc74231db624b169360e3c8f60196a1afc9f2101e03922
+B = d6d7c830a6edb6861868b964519a6b68f6f24f7c09d66003f3f88eadd1e00158
+Result = 0b89596bf5054ebe95a39dab6e975b58190160610b09b2a4f93331ecc0e79fd3
+
+Test = OrdMulMont
+A = 8f36f0ef275a72192c3b7388e84df2b8acf66fc53aaf556e3be05c76b3f782c0
+B = 704e519363d44e8df8d91f5f347eb61e8d3e85c8fc1b82980c370a379b2bc81c
+Result = b70a392e3ce5e85b5efbbded9b8c16a3068ba9b93b4cbed9a9a71dffaad6b58a
+
+Test = OrdMulMont
+A = bf4466ef4dea9f06f0f3b4f14e01140a774262c7e0706584f4d7dac19be46d58
+B = 4af12d528b2cef0f6714961bca2ab682f8abaa97600ea8181f71563d56f8a9f5
+Result = 7b6827c0881b9846e32499e13277efb07917cf4b8c8c72bfb3daa8c1786a8e15
+
+
+# Test cases where A == B to test squaring.
+
+Test = OrdMulMont
+A = 0000000000000000000000000000000000000000000000000000000000000000
+B = 0000000000000000000000000000000000000000000000000000000000000000
+Result = 0000000000000000000000000000000000000000000000000000000000000000
+
+Test = OrdMulMont
+A = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+B = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+Result = 00000000ffffffff00000000000000004319055258e8617b0c46353d039cdaaf
+
+Test = OrdMulMont
+A = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632550
+B = ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632550
+Result = 60d066334905c1e907f8b6041e607725badef3e243566fafce1bc8f79c197c79
+
+Test = OrdMulMont
+A = da43b8dd7fe8830a4fe8980ec585ccbe903a2965a695cdff398200b74b2ede41
+B = da43b8dd7fe8830a4fe8980ec585ccbe903a2965a695cdff398200b74b2ede41
+Result = 5ec68604412205b380e26ee4e4081eccc10ac7d1417b09cd534f8517b0de81ec
+
+Test = OrdMulMont
+A = a82a2b8bdbf8a37dc7cb5799691494a8c9fbf649686a4d250dc30697feb0fa47
+B = a82a2b8bdbf8a37dc7cb5799691494a8c9fbf649686a4d250dc30697feb0fa47
+Result = 552c094a8841621d6cc26b3b54ce5da5664283888445196a6433d3cfdcad3aee
+
+Test = OrdMulMont
+A = d785006e250410d9dcc6d7740795a7374c25b00b9c9a37b8285694a07307eacd
+B = d785006e250410d9dcc6d7740795a7374c25b00b9c9a37b8285694a07307eacd
+Result = 971aaa9e70ad082cf43725f2e65bc73f4bf762459cee13167545072ec7bdcaf8
+
+Test = OrdMulMont
+A = 69d6d9f5417e87d603a3fb6acafa0d1f974abf94ca57ce58d718a0ad5d02a496
+B = 69d6d9f5417e87d603a3fb6acafa0d1f974abf94ca57ce58d718a0ad5d02a496
+Result = eb3284e5799fbe93171f08e6de9f792cd17f036b3a17671b0310e49b48e589b3
+
+Test = OrdMulMont
+A = 1c28f742c3e26e74901d0425f2eb4d5272524668d2405875b32cf6433f212900
+B = 1c28f742c3e26e74901d0425f2eb4d5272524668d2405875b32cf6433f212900
+Result = 74f70a95399b7ad061a2200fa50528d68eee4654341c8158101e1e3f8f16e642
+
+Test = OrdMulMont
+A = 026b2f69f0259d221920b2f358b378a79826f0332ee36afa257765043e3d6732
+B = 026b2f69f0259d221920b2f358b378a79826f0332ee36afa257765043e3d6732
+Result = e1e9cfa4724995bb50971ca22f3c028cd31cb51fbef8a37c31f10fd1d468f13b
+
+Test = OrdMulMont
+A = 376ed4fadcc1c6c4160a0c9c2ab7c62260367968b08d304d47c65f25625d7d60
+B = 376ed4fadcc1c6c4160a0c9c2ab7c62260367968b08d304d47c65f25625d7d60
+Result = b9ccb67f377e1278f1d2eeda26e5eed76f32406c9deed9764fc0aa346d91e02b
+
+Test = OrdMulMont
+A = 50f66867d0a4ef389678d760d2a4db886583b4c068d0e240f7ddf3472c871304
+B = 50f66867d0a4ef389678d760d2a4db886583b4c068d0e240f7ddf3472c871304
+Result = 82c3467bc5f7ca8b45f4ee61546745e2f53755a02e87f65f572418d60e471c8b
+
+Test = OrdMulMont
+A = 5b8bd82b37206d2b727f19ad2d02f63773470074dde7d43d2a77c448ddf2f978
+B = 5b8bd82b37206d2b727f19ad2d02f63773470074dde7d43d2a77c448ddf2f978
+Result = dbf3c2fc67a0688c3b5ff12cab1739d50b6093c5d98943d388652b1207e4a0f2
+
+Test = OrdMulMont
+A = bed7b3a4dada0e16984eb59ee239005ab212e5b1772cdd5d240c8ee268f65c81
+B = bed7b3a4dada0e16984eb59ee239005ab212e5b1772cdd5d240c8ee268f65c81
+Result = 9232aa2759ca9c5efbaefb0cf45cc6bc9c89def8c25e5c169fe623f30787df36
diff --git a/src/crypto/fipsmodule/ec/scalar.c b/src/crypto/fipsmodule/ec/scalar.c
new file mode 100644
index 0000000..aa364de
--- /dev/null
+++ b/src/crypto/fipsmodule/ec/scalar.c
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/ec.h>
+
+#include "internal.h"
+#include "../bn/internal.h"
+
+
+int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
+                        const BIGNUM *in) {
+  if (!bn_copy_words(out->words, group->order.width, in) ||
+      !bn_less_than_words(out->words, group->order.d, group->order.width)) {
+    OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
+    return 0;
+  }
+  return 1;
+}
+
+int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out,
+                             const uint8_t additional_data[32]) {
+  return bn_rand_range_words(out->words, 1, group->order.d, group->order.width,
+                             additional_data);
+}
+
+void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
+                   const EC_SCALAR *b) {
+  const BIGNUM *order = &group->order;
+  BN_ULONG tmp[EC_MAX_SCALAR_WORDS];
+  bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width);
+  OPENSSL_cleanse(tmp, sizeof(tmp));
+}
+
+void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                             const EC_SCALAR *a) {
+  const BIGNUM *order = &group->order;
+  bn_to_montgomery_small(r->words, a->words, order->width, group->order_mont);
+}
+
+void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                               const EC_SCALAR *a) {
+  const BIGNUM *order = &group->order;
+  bn_from_montgomery_small(r->words, a->words, order->width, group->order_mont);
+}
+
+void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a, const EC_SCALAR *b) {
+  const BIGNUM *order = &group->order;
+  bn_mod_mul_montgomery_small(r->words, a->words, b->words, order->width,
+                              group->order_mont);
+}
+
+void ec_simple_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                                     const EC_SCALAR *a) {
+  const BIGNUM *order = &group->order;
+  bn_mod_inverse_prime_mont_small(r->words, a->words, order->width,
+                                  group->order_mont);
+}
+
+void ec_scalar_inv_montgomery(const EC_GROUP *group, EC_SCALAR *r,
+                              const EC_SCALAR *a) {
+  group->meth->scalar_inv_montgomery(group, r, a);
+}
diff --git a/src/crypto/fipsmodule/ecdsa/ecdsa.c b/src/crypto/fipsmodule/ecdsa/ecdsa.c
index 85490fa..f3ce214 100644
--- a/src/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/src/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -66,22 +66,6 @@
 #include "../../internal.h"
 
 
-static void scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a,
-                       const EC_SCALAR *b) {
-  const BIGNUM *order = &group->order;
-  BN_ULONG tmp[EC_MAX_SCALAR_WORDS];
-  bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width);
-  OPENSSL_cleanse(tmp, sizeof(tmp));
-}
-
-static int scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r,
-                                 const EC_SCALAR *a, const EC_SCALAR *b) {
-  const BIGNUM *order = &group->order;
-  return bn_mod_mul_montgomery_small(r->words, order->width, a->words,
-                                     order->width, b->words, order->width,
-                                     group->order_mont);
-}
-
 // digest_to_scalar interprets |digest_len| bytes from |digest| as a scalar for
 // ECDSA. Note this value is not fully reduced modulo the order, only the
 // correct number of bits.
@@ -217,7 +201,6 @@
   }
 
   EC_SCALAR r, s, u1, u2, s_inv_mont, m;
-  const BIGNUM *order = EC_GROUP_get0_order(group);
   if (BN_is_zero(sig->r) ||
       !ec_bignum_to_scalar(group, &r, sig->r) ||
       BN_is_zero(sig->s) ||
@@ -225,27 +208,22 @@
     OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE);
     goto err;
   }
-  // s_inv_mont = s^-1 mod order. We convert the result to Montgomery form for
-  // the products below.
-  int no_inverse;
-  if (!BN_mod_inverse_odd(X, &no_inverse, sig->s, order, ctx) ||
-      // TODO(davidben): Add a words version of |BN_mod_inverse_odd| and write
-      // into |s_inv_mont| directly.
-      !ec_bignum_to_scalar_unchecked(group, &s_inv_mont, X) ||
-      !bn_to_montgomery_small(s_inv_mont.words, order->width, s_inv_mont.words,
-                              order->width, group->order_mont)) {
-    goto err;
-  }
+
+  // s_inv_mont = s^-1 in the Montgomery domain. This is
+  // |ec_scalar_to_montgomery| followed by |ec_scalar_inv_montgomery|, but
+  // |ec_scalar_inv_montgomery| followed by |ec_scalar_from_montgomery| is
+  // equivalent and slightly more efficient.
+  ec_scalar_inv_montgomery(group, &s_inv_mont, &s);
+  ec_scalar_from_montgomery(group, &s_inv_mont, &s_inv_mont);
+
   // u1 = m * s^-1 mod order
   // u2 = r * s^-1 mod order
   //
   // |s_inv_mont| is in Montgomery form while |m| and |r| are not, so |u1| and
   // |u2| will be taken out of Montgomery form, as desired.
   digest_to_scalar(group, &m, digest, digest_len);
-  if (!scalar_mul_montgomery(group, &u1, &m, &s_inv_mont) ||
-      !scalar_mul_montgomery(group, &u2, &r, &s_inv_mont)) {
-    goto err;
-  }
+  ec_scalar_mul_montgomery(group, &u1, &m, &s_inv_mont);
+  ec_scalar_mul_montgomery(group, &u2, &r, &s_inv_mont);
 
   point = EC_POINT_new(group);
   if (point == NULL) {
@@ -328,15 +306,12 @@
       }
     }
 
-    // Compute k^-1. We leave it in the Montgomery domain as an optimization for
-    // later operations.
-    if (!bn_to_montgomery_small(out_kinv_mont->words, order->width, k.words,
-                                order->width, group->order_mont) ||
-        !bn_mod_inverse_prime_mont_small(out_kinv_mont->words, order->width,
-                                         out_kinv_mont->words, order->width,
-                                         group->order_mont)) {
-      goto err;
-    }
+    // Compute k^-1 in the Montgomery domain. This is |ec_scalar_to_montgomery|
+    // followed by |ec_scalar_inv_montgomery|, but |ec_scalar_inv_montgomery|
+    // followed by |ec_scalar_from_montgomery| is equivalent and slightly more
+    // efficient.
+    ec_scalar_inv_montgomery(group, out_kinv_mont, &k);
+    ec_scalar_from_montgomery(group, out_kinv_mont, out_kinv_mont);
 
     // Compute r, the x-coordinate of generator * k.
     if (!ec_point_mul_scalar(group, tmp_point, &k, NULL, NULL, ctx) ||
@@ -396,20 +371,19 @@
     // Compute priv_key * r (mod order). Note if only one parameter is in the
     // Montgomery domain, |scalar_mod_mul_montgomery| will compute the answer in
     // the normal domain.
-    if (!ec_bignum_to_scalar(group, &r_mont, ret->r) ||
-        !bn_to_montgomery_small(r_mont.words, order->width, r_mont.words,
-                                order->width, group->order_mont) ||
-        !scalar_mul_montgomery(group, &s, priv_key, &r_mont)) {
+    if (!ec_bignum_to_scalar(group, &r_mont, ret->r)) {
       goto err;
     }
+    ec_scalar_to_montgomery(group, &r_mont, &r_mont);
+    ec_scalar_mul_montgomery(group, &s, priv_key, &r_mont);
 
     // Compute tmp = m + priv_key * r.
-    scalar_add(group, &tmp, &m, &s);
+    ec_scalar_add(group, &tmp, &m, &s);
 
     // Finally, multiply s by k^-1. That was retained in Montgomery form, so the
     // same technique as the previous multiplication works.
-    if (!scalar_mul_montgomery(group, &s, &tmp, &kinv_mont) ||
-        !bn_set_words(ret->s, s.words, order->width)) {
+    ec_scalar_mul_montgomery(group, &s, &tmp, &kinv_mont);
+    if (!bn_set_words(ret->s, s.words, order->width)) {
       goto err;
     }
     if (!BN_is_zero(ret->s)) {
diff --git a/src/crypto/fipsmodule/rsa/rsa_impl.c b/src/crypto/fipsmodule/rsa/rsa_impl.c
index 49cbc15..6d1206b 100644
--- a/src/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/src/crypto/fipsmodule/rsa/rsa_impl.c
@@ -167,7 +167,7 @@
 
   if (rsa->p != NULL && rsa->q != NULL) {
     if (rsa->mont_p == NULL) {
-      rsa->mont_p = BN_MONT_CTX_new_for_modulus(rsa->p, ctx);
+      rsa->mont_p = BN_MONT_CTX_new_consttime(rsa->p, ctx);
       if (rsa->mont_p == NULL) {
         goto err;
       }
@@ -175,7 +175,7 @@
     const BIGNUM *p_fixed = &rsa->mont_p->N;
 
     if (rsa->mont_q == NULL) {
-      rsa->mont_q = BN_MONT_CTX_new_for_modulus(rsa->q, ctx);
+      rsa->mont_q = BN_MONT_CTX_new_consttime(rsa->q, ctx);
       if (rsa->mont_q == NULL) {
         goto err;
       }
@@ -715,7 +715,13 @@
   }
 
   if (rsa->p != NULL && rsa->q != NULL && rsa->e != NULL && rsa->dmp1 != NULL &&
-      rsa->dmq1 != NULL && rsa->iqmp != NULL) {
+      rsa->dmq1 != NULL && rsa->iqmp != NULL &&
+      // Require that we can reduce |f| by |rsa->p| and |rsa->q| in constant
+      // time, which requires primes be the same size, rounded to the Montgomery
+      // coefficient. (See |mod_montgomery|.) This is not required by RFC 8017,
+      // but it is true for keys generated by us and all common implementations.
+      bn_less_than_montgomery_R(rsa->q, rsa->mont_p) &&
+      bn_less_than_montgomery_R(rsa->p, rsa->mont_q)) {
     if (!mod_exp(result, f, rsa, ctx)) {
       goto err;
     }
@@ -780,11 +786,11 @@
                           const BN_MONT_CTX *mont_p, const BIGNUM *q,
                           BN_CTX *ctx) {
   // Reducing in constant-time with Montgomery reduction requires I <= p * R. We
-  // have I < p * q, so this follows if q < R. In particular, this always holds
-  // if p and q are the same size, which is true for any RSA keys we or anyone
-  // sane generates. For other keys, we fall back to |BN_mod|.
+  // have I < p * q, so this follows if q < R. The caller should have checked
+  // this already.
   if (!bn_less_than_montgomery_R(q, mont_p)) {
-    return BN_mod(r, I, p, ctx);
+    OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR);
+    return 0;
   }
 
   if (// Reduce mod p with Montgomery reduction. This computes I * R^-1 mod p.
@@ -928,6 +934,8 @@
 // relatively prime to |e|. If |p| is non-NULL, |out| will also not be close to
 // |p|. |sqrt2| must be ⌊2^(bits-1)×√2⌋ (or a slightly overestimate for large
 // sizes), and |pow2_bits_100| must be 2^(bits-100).
+//
+// This function fails with probability around 2^-21.
 static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e,
                           const BIGNUM *p, const BIGNUM *sqrt2,
                           const BIGNUM *pow2_bits_100, BN_CTX *ctx,
@@ -944,11 +952,36 @@
   // Use the limit from steps 4.7 and 5.8 for most values of |e|. When |e| is 3,
   // the 186-4 limit is too low, so we use a higher one. Note this case is not
   // reachable from |RSA_generate_key_fips|.
+  //
+  // |limit| determines the failure probability. We must find a prime that is
+  // not 1 mod |e|. By the prime number theorem, we'll find one with probability
+  // p = (e-1)/e * 2/(ln(2)*bits). Note the second term is doubled because we
+  // discard even numbers.
+  //
+  // The failure probability is thus (1-p)^limit. To convert that to a power of
+  // two, we take logs. -log_2((1-p)^limit) = -limit * ln(1-p) / ln(2).
+  //
+  // >>> def f(bits, e, limit):
+  // ...   p = (e-1.0)/e * 2.0/(math.log(2)*bits)
+  // ...   return -limit * math.log(1 - p) / math.log(2)
+  // ...
+  // >>> f(1024, 65537, 5*1024)
+  // 20.842750558272634
+  // >>> f(1536, 65537, 5*1536)
+  // 20.83294549602474
+  // >>> f(2048, 65537, 5*2048)
+  // 20.828047576234948
+  // >>> f(1024, 3, 8*1024)
+  // 22.222147925962307
+  // >>> f(1536, 3, 8*1536)
+  // 22.21518251065506
+  // >>> f(2048, 3, 8*2048)
+  // 22.211701985875937
   if (bits >= INT_MAX/32) {
     OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE);
     return 0;
   }
-  int limit = BN_is_word(e, 3) ? bits * 32 : bits * 5;
+  int limit = BN_is_word(e, 3) ? bits * 8 : bits * 5;
 
   int ret = 0, tries = 0, rand_tries = 0;
   BN_CTX_start(ctx);
@@ -1027,7 +1060,14 @@
   return ret;
 }
 
-int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) {
+// rsa_generate_key_impl generates an RSA key using a generalized version of
+// FIPS 186-4 appendix B.3. |RSA_generate_key_fips| performs additional checks
+// for FIPS-compliant key generation.
+//
+// This function returns one on success and zero on failure. It has a failure
+// probability of about 2^-20.
+static int rsa_generate_key_impl(RSA *rsa, int bits, BIGNUM *e_value,
+                                 BN_GENCB *cb) {
   // See FIPS 186-4 appendix B.3. This function implements a generalized version
   // of the FIPS algorithm. |RSA_generate_key_fips| performs additional checks
   // for FIPS-compliant key generation.
@@ -1113,6 +1153,9 @@
   do {
     // Generate p and q, each of size |prime_bits|, using the steps outlined in
     // appendix FIPS 186-4 appendix B.3.3.
+    //
+    // Each call to |generate_prime| fails with probability p = 2^-21. The
+    // probability that either call fails is 1 - (1-p)^2, which is around 2^-20.
     if (!generate_prime(rsa->p, prime_bits, rsa->e, NULL, sqrt2,
                         pow2_prime_bits_100, ctx, cb) ||
         !BN_GENCB_call(cb, 3, 0) ||
@@ -1192,6 +1235,65 @@
   return ret;
 }
 
+static void replace_bignum(BIGNUM **out, BIGNUM **in) {
+  BN_free(*out);
+  *out = *in;
+  *in = NULL;
+}
+
+static void replace_bn_mont_ctx(BN_MONT_CTX **out, BN_MONT_CTX **in) {
+  BN_MONT_CTX_free(*out);
+  *out = *in;
+  *in = NULL;
+}
+
+int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) {
+  // |rsa_generate_key_impl|'s 2^-20 failure probability is too high at scale,
+  // so we run the FIPS algorithm four times, bringing it down to 2^-80. We
+  // should just adjust the retry limit, but FIPS 186-4 prescribes that value
+  // and thus results in unnecessary complexity.
+  for (int i = 0; i < 4; i++) {
+    ERR_clear_error();
+    // Generate into scratch space, to avoid leaving partial work on failure.
+    RSA *tmp = RSA_new();
+    if (tmp == NULL) {
+      return 0;
+    }
+    if (rsa_generate_key_impl(tmp, bits, e_value, cb)) {
+      replace_bignum(&rsa->n, &tmp->n);
+      replace_bignum(&rsa->e, &tmp->e);
+      replace_bignum(&rsa->d, &tmp->d);
+      replace_bignum(&rsa->p, &tmp->p);
+      replace_bignum(&rsa->q, &tmp->q);
+      replace_bignum(&rsa->dmp1, &tmp->dmp1);
+      replace_bignum(&rsa->dmq1, &tmp->dmq1);
+      replace_bignum(&rsa->iqmp, &tmp->iqmp);
+      replace_bn_mont_ctx(&rsa->mont_n, &tmp->mont_n);
+      replace_bn_mont_ctx(&rsa->mont_p, &tmp->mont_p);
+      replace_bn_mont_ctx(&rsa->mont_q, &tmp->mont_q);
+      replace_bignum(&rsa->d_fixed, &tmp->d_fixed);
+      replace_bignum(&rsa->dmp1_fixed, &tmp->dmp1_fixed);
+      replace_bignum(&rsa->dmq1_fixed, &tmp->dmq1_fixed);
+      replace_bignum(&rsa->inv_small_mod_large_mont,
+                     &tmp->inv_small_mod_large_mont);
+      rsa->private_key_frozen = tmp->private_key_frozen;
+      RSA_free(tmp);
+      return 1;
+    }
+    uint32_t err = ERR_peek_error();
+    RSA_free(tmp);
+    tmp = NULL;
+    // Only retry on |RSA_R_TOO_MANY_ITERATIONS|. This is so a caller-induced
+    // failure in |BN_GENCB_call| is still fatal.
+    if (ERR_GET_LIB(err) != ERR_LIB_RSA ||
+        ERR_GET_REASON(err) != RSA_R_TOO_MANY_ITERATIONS) {
+      return 0;
+    }
+  }
+
+  return 0;
+}
+
 int RSA_generate_key_fips(RSA *rsa, int bits, BN_GENCB *cb) {
   // FIPS 186-4 allows 2048-bit and 3072-bit RSA keys (1024-bit and 1536-bit
   // primes, respectively) with the prime generation method we use.
diff --git a/src/crypto/mem.c b/src/crypto/mem.c
index 50c6fe6..5d45baa 100644
--- a/src/crypto/mem.c
+++ b/src/crypto/mem.c
@@ -59,7 +59,6 @@
 #include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
-#include <string.h>
 
 #if defined(OPENSSL_WINDOWS)
 OPENSSL_MSVC_PRAGMA(warning(push, 3))
diff --git a/src/crypto/rsa_extra/rsa_test.cc b/src/crypto/rsa_extra/rsa_test.cc
index a6bfb87..211b690 100644
--- a/src/crypto/rsa_extra/rsa_test.cc
+++ b/src/crypto/rsa_extra/rsa_test.cc
@@ -501,12 +501,12 @@
   ERR_clear_error();
 
   // Test that we can generate 2048-bit and 3072-bit RSA keys.
-  EXPECT_TRUE(RSA_generate_key_fips(rsa.get(), 2048, nullptr));
+  ASSERT_TRUE(RSA_generate_key_fips(rsa.get(), 2048, nullptr));
   EXPECT_EQ(2048u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_fips(rsa.get(), 3072, nullptr));
+  ASSERT_TRUE(RSA_generate_key_fips(rsa.get(), 3072, nullptr));
   EXPECT_EQ(3072u, BN_num_bits(rsa->n));
 }
 
@@ -653,22 +653,22 @@
 
   bssl::UniquePtr<RSA> rsa(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1025, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1025, e.get(), nullptr));
   EXPECT_EQ(1024u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1027, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1027, e.get(), nullptr));
   EXPECT_EQ(1024u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1151, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1151, e.get(), nullptr));
   EXPECT_EQ(1024u, BN_num_bits(rsa->n));
 
   rsa.reset(RSA_new());
   ASSERT_TRUE(rsa);
-  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 1152, e.get(), nullptr));
+  ASSERT_TRUE(RSA_generate_key_ex(rsa.get(), 1152, e.get(), nullptr));
   EXPECT_EQ(1152u, BN_num_bits(rsa->n));
 }
 
@@ -896,6 +896,123 @@
   ASSERT_TRUE(BN_sub(rsa->iqmp, rsa->iqmp, rsa->p));
 }
 
+TEST(RSATest, KeygenFail) {
+  bssl::UniquePtr<RSA> rsa(RSA_new());
+  ASSERT_TRUE(rsa);
+
+  // Cause RSA key generation after a prime has been generated, to test that
+  // |rsa| is left alone.
+  BN_GENCB cb;
+  BN_GENCB_set(&cb,
+               [](int event, int, BN_GENCB *) -> int { return event != 3; },
+               nullptr);
+
+  bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+
+  // Key generation should fail.
+  EXPECT_FALSE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+
+  // Failed key generations do not leave garbage in |rsa|.
+  EXPECT_FALSE(rsa->n);
+  EXPECT_FALSE(rsa->e);
+  EXPECT_FALSE(rsa->d);
+  EXPECT_FALSE(rsa->p);
+  EXPECT_FALSE(rsa->q);
+  EXPECT_FALSE(rsa->dmp1);
+  EXPECT_FALSE(rsa->dmq1);
+  EXPECT_FALSE(rsa->iqmp);
+  EXPECT_FALSE(rsa->mont_n);
+  EXPECT_FALSE(rsa->mont_p);
+  EXPECT_FALSE(rsa->mont_q);
+  EXPECT_FALSE(rsa->d_fixed);
+  EXPECT_FALSE(rsa->dmp1_fixed);
+  EXPECT_FALSE(rsa->dmq1_fixed);
+  EXPECT_FALSE(rsa->inv_small_mod_large_mont);
+  EXPECT_FALSE(rsa->private_key_frozen);
+
+  // Failed key generations leave the previous contents alone.
+  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), nullptr));
+  uint8_t *der;
+  size_t der_len;
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, rsa.get()));
+  bssl::UniquePtr<uint8_t> delete_der(der);
+
+  EXPECT_FALSE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+
+  uint8_t *der2;
+  size_t der2_len;
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der2, &der2_len, rsa.get()));
+  bssl::UniquePtr<uint8_t> delete_der2(der2);
+  EXPECT_EQ(Bytes(der, der_len), Bytes(der2, der2_len));
+
+  // Generating a key over an existing key works, despite any cached state.
+  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), nullptr));
+  EXPECT_TRUE(RSA_check_key(rsa.get()));
+  uint8_t *der3;
+  size_t der3_len;
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der3, &der3_len, rsa.get()));
+  bssl::UniquePtr<uint8_t> delete_der3(der3);
+  EXPECT_NE(Bytes(der, der_len), Bytes(der3, der3_len));
+}
+
+TEST(RSATest, KeygenFailOnce) {
+  bssl::UniquePtr<RSA> rsa(RSA_new());
+  ASSERT_TRUE(rsa);
+
+  // Cause only the first iteration of RSA key generation to fail.
+  bool failed = false;
+  BN_GENCB cb;
+  BN_GENCB_set(&cb,
+               [](int event, int n, BN_GENCB *cb_ptr) -> int {
+                 bool *failed_ptr = static_cast<bool *>(cb_ptr->arg);
+                 if (*failed_ptr) {
+                   ADD_FAILURE() << "Callback called multiple times.";
+                   return 1;
+                 }
+                 *failed_ptr = true;
+                 return 0;
+               },
+               &failed);
+
+  // Although key generation internally retries, the external behavior of
+  // |BN_GENCB| is preserved.
+  bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+  EXPECT_FALSE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+}
+
+TEST(RSATest, KeygenInternalRetry) {
+  bssl::UniquePtr<RSA> rsa(RSA_new());
+  ASSERT_TRUE(rsa);
+
+  // Simulate one internal attempt at key generation failing.
+  bool failed = false;
+  BN_GENCB cb;
+  BN_GENCB_set(&cb,
+               [](int event, int n, BN_GENCB *cb_ptr) -> int {
+                 bool *failed_ptr = static_cast<bool *>(cb_ptr->arg);
+                 if (*failed_ptr) {
+                   return 1;
+                 }
+                 *failed_ptr = true;
+                 // This test does not test any public API behavior. It is just
+                 // a hack to exercise the retry codepath and make sure it
+                 // works.
+                 OPENSSL_PUT_ERROR(RSA, RSA_R_TOO_MANY_ITERATIONS);
+                 return 0;
+               },
+               &failed);
+
+  // Key generation internally retries on RSA_R_TOO_MANY_ITERATIONS.
+  bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+  EXPECT_TRUE(RSA_generate_key_ex(rsa.get(), 2048, e.get(), &cb));
+}
+
 #if !defined(BORINGSSL_SHARED_LIBRARY)
 TEST(RSATest, SqrtTwo) {
   bssl::UniquePtr<BIGNUM> sqrt(BN_new()), pow2(BN_new());
diff --git a/src/crypto/x509/by_dir.c b/src/crypto/x509/by_dir.c
index b3bfffe..9a0e2eb 100644
--- a/src/crypto/x509/by_dir.c
+++ b/src/crypto/x509/by_dir.c
@@ -65,6 +65,8 @@
 #include <openssl/thread.h>
 #include <openssl/x509.h>
 
+#if !defined(OPENSSL_TRUSTY)
+
 #include "../internal.h"
 
 typedef struct lookup_dir_hashes_st {
@@ -452,3 +454,5 @@
         BUF_MEM_free(b);
     return (ok);
 }
+
+#endif  // OPENSSL_TRUSTY
diff --git a/src/include/openssl/bn.h b/src/include/openssl/bn.h
index 1618022..e8cc70a 100644
--- a/src/include/openssl/bn.h
+++ b/src/include/openssl/bn.h
@@ -659,8 +659,7 @@
 // BN_GENCB_set configures |callback| to call |f| and sets |callout->arg| to
 // |arg|.
 OPENSSL_EXPORT void BN_GENCB_set(BN_GENCB *callback,
-                                 int (*f)(int event, int n,
-                                          struct bn_gencb_st *),
+                                 int (*f)(int event, int n, BN_GENCB *),
                                  void *arg);
 
 // BN_GENCB_call calls |callback|, if not NULL, and returns the return value of
@@ -812,10 +811,15 @@
 // Montgomery domain.
 
 // BN_MONT_CTX_new_for_modulus returns a fresh |BN_MONT_CTX| given the modulus,
-// |mod| or NULL on error.
+// |mod| or NULL on error. Note this function assumes |mod| is public.
 OPENSSL_EXPORT BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod,
                                                         BN_CTX *ctx);
 
+// BN_MONT_CTX_new_consttime behaves like |BN_MONT_CTX_new_for_modulus| but
+// treats |mod| as secret.
+OPENSSL_EXPORT BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod,
+                                                      BN_CTX *ctx);
+
 // BN_MONT_CTX_free frees memory associated with |mont|.
 OPENSSL_EXPORT void BN_MONT_CTX_free(BN_MONT_CTX *mont);
 
@@ -826,7 +830,8 @@
 
 // BN_MONT_CTX_set_locked takes |lock| and checks whether |*pmont| is NULL. If
 // so, it creates a new |BN_MONT_CTX| and sets the modulus for it to |mod|. It
-// then stores it as |*pmont|. It returns one on success and zero on error.
+// then stores it as |*pmont|. It returns one on success and zero on error. Note
+// this function assumes |mod| is public.
 //
 // If |*pmont| is already non-NULL then it does nothing and returns one.
 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
@@ -869,10 +874,14 @@
 OPENSSL_EXPORT int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
                               const BIGNUM *m, BN_CTX *ctx);
 
+// BN_mod_exp_mont behaves like |BN_mod_exp| but treats |a| as secret and
+// requires 0 <= |a| < |m|.
 OPENSSL_EXPORT int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
                                    const BIGNUM *m, BN_CTX *ctx,
                                    const BN_MONT_CTX *mont);
 
+// BN_mod_exp_mont_consttime behaves like |BN_mod_exp| but treats |a|, |p|, and
+// |m| as secret and requires 0 <= |a| < |m|.
 OPENSSL_EXPORT int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a,
                                              const BIGNUM *p, const BIGNUM *m,
                                              BN_CTX *ctx,
@@ -952,9 +961,10 @@
 };
 
 struct bn_mont_ctx_st {
-  // RR is R^2, reduced modulo |N|. It is used to convert to Montgomery form.
+  // RR is R^2, reduced modulo |N|. It is used to convert to Montgomery form. It
+  // is guaranteed to have the same width as |N|.
   BIGNUM RR;
-  // N is the modulus. It is always stored in minimal form, so |N.top|
+  // N is the modulus. It is always stored in minimal form, so |N.width|
   // determines R.
   BIGNUM N;
   BN_ULONG n0[2];  // least significant words of (R*Ri-1)/N
diff --git a/src/include/openssl/cpu.h b/src/include/openssl/cpu.h
index dd95ddc..bb847f9 100644
--- a/src/include/openssl/cpu.h
+++ b/src/include/openssl/cpu.h
@@ -86,7 +86,8 @@
 //     Bit 11 is used to indicate AMD XOP support, not SDBG
 //   Index 2:
 //     EBX for CPUID where EAX = 7
-//   Index 3 is set to zero.
+//   Index 3:
+//     ECX for CPUID where EAX = 7
 //
 // Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM
 // bits in XCR0, so it is not necessary to check those.
diff --git a/src/include/openssl/span.h b/src/include/openssl/span.h
index 3a629f7..5ed96b7 100644
--- a/src/include/openssl/span.h
+++ b/src/include/openssl/span.h
@@ -22,7 +22,6 @@
 extern "C++" {
 
 #include <algorithm>
-#include <cassert>
 #include <cstdlib>
 #include <type_traits>
 
@@ -136,16 +135,25 @@
   const T *cend() const { return end(); };
 
   T &front() const {
-    assert(size_ != 0);
+    if (size_ == 0) {
+      abort();
+    }
     return data_[0];
   }
   T &back() const {
-    assert(size_ != 0);
+    if (size_ == 0) {
+      abort();
+    }
     return data_[size_ - 1];
   }
 
-  T &operator[](size_t i) const { return data_[i]; }
-  T &at(size_t i) const { return data_[i]; }
+  T &operator[](size_t i) const {
+    if (i >= size_) {
+      abort();
+    }
+    return data_[i];
+  }
+  T &at(size_t i) const { return (*this)[i]; }
 
   Span subspan(size_t pos = 0, size_t len = npos) const {
     if (pos > size_) {
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index 2922473..51162ea 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -970,9 +970,9 @@
 #define SSL_SIGN_ECDSA_SECP256R1_SHA256 0x0403
 #define SSL_SIGN_ECDSA_SECP384R1_SHA384 0x0503
 #define SSL_SIGN_ECDSA_SECP521R1_SHA512 0x0603
-#define SSL_SIGN_RSA_PSS_SHA256 0x0804
-#define SSL_SIGN_RSA_PSS_SHA384 0x0805
-#define SSL_SIGN_RSA_PSS_SHA512 0x0806
+#define SSL_SIGN_RSA_PSS_RSAE_SHA256 0x0804
+#define SSL_SIGN_RSA_PSS_RSAE_SHA384 0x0805
+#define SSL_SIGN_RSA_PSS_RSAE_SHA512 0x0806
 #define SSL_SIGN_ED25519 0x0807
 
 // SSL_SIGN_RSA_PKCS1_MD5_SHA1 is an internal signature algorithm used to
@@ -2467,10 +2467,19 @@
 OPENSSL_EXPORT int SSL_set1_verify_cert_store(SSL *ssl, X509_STORE *store);
 
 // SSL_CTX_set_ed25519_enabled configures whether |ctx| advertises support for
-// the Ed25519 signature algorithm when using the default preference list.
+// the Ed25519 signature algorithm when using the default preference list. It is
+// disabled by default and may be enabled if the certificate verifier supports
+// Ed25519.
 OPENSSL_EXPORT void SSL_CTX_set_ed25519_enabled(SSL_CTX *ctx, int enabled);
 
-// SSL_CTX_set_verify_algorithm_prefs confingures |ctx| to use |prefs| as the
+// SSL_CTX_set_rsa_pss_rsae_certs_enabled configures whether |ctx| advertises
+// support for rsa_pss_rsae_* signatures within the certificate chain. It is
+// enabled by default but should be disabled if using a custom certificate
+// verifier which does not support RSA-PSS signatures.
+OPENSSL_EXPORT void SSL_CTX_set_rsa_pss_rsae_certs_enabled(SSL_CTX *ctx,
+                                                           int enabled);
+
+// SSL_CTX_set_verify_algorithm_prefs configures |ctx| to use |prefs| as the
 // preference list when verifying signature's from the peer's long-term key. It
 // returns one on zero on error. |prefs| should not include the internal-only
 // value |SSL_SIGN_RSA_PKCS1_MD5_SHA1|.
@@ -4116,15 +4125,11 @@
 OPENSSL_EXPORT int OPENSSL_init_ssl(uint64_t opts,
                                     const OPENSSL_INIT_SETTINGS *settings);
 
-#if !defined(BORINGSSL_NO_CXX)
-// SSL_CTX_sess_set_get_cb is a legacy C++ overload of |SSL_CTX_sess_set_get_cb|
-// which supports the old callback signature.
-//
-// TODO(davidben): Remove this once Node is compatible with OpenSSL 1.1.0.
-extern "C++" OPENSSL_EXPORT void SSL_CTX_sess_set_get_cb(
-    SSL_CTX *ctx, SSL_SESSION *(*get_session_cb)(SSL *ssl, uint8_t *id,
-                                                 int id_len, int *out_copy));
-#endif
+// The following constants are legacy aliases for RSA-PSS with rsaEncryption
+// keys. Use the new names instead.
+#define SSL_SIGN_RSA_PSS_SHA256 SSL_SIGN_RSA_PSS_RSAE_SHA256
+#define SSL_SIGN_RSA_PSS_SHA384 SSL_SIGN_RSA_PSS_RSAE_SHA384
+#define SSL_SIGN_RSA_PSS_SHA512 SSL_SIGN_RSA_PSS_RSAE_SHA512
 
 
 // Private structures.
diff --git a/src/include/openssl/tls1.h b/src/include/openssl/tls1.h
index 3424f3d..2aa018b 100644
--- a/src/include/openssl/tls1.h
+++ b/src/include/openssl/tls1.h
@@ -219,6 +219,7 @@
 #define TLSEXT_TYPE_cookie 44
 #define TLSEXT_TYPE_psk_key_exchange_modes 45
 #define TLSEXT_TYPE_certificate_authorities 47
+#define TLSEXT_TYPE_signature_algorithms_cert 50
 #define TLSEXT_TYPE_key_share 51
 
 // ExtensionType value from RFC5746
diff --git a/src/ssl/d1_srtp.cc b/src/ssl/d1_srtp.cc
index 1a8e084..f27c9ff 100644
--- a/src/ssl/d1_srtp.cc
+++ b/src/ssl/d1_srtp.cc
@@ -218,7 +218,7 @@
 }
 
 const SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *ssl) {
-  return ssl->srtp_profile;
+  return ssl->s3->srtp_profile;
 }
 
 int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles) {
diff --git a/src/ssl/handshake_client.cc b/src/ssl/handshake_client.cc
index 087645d..fbef2e1 100644
--- a/src/ssl/handshake_client.cc
+++ b/src/ssl/handshake_client.cc
@@ -1422,7 +1422,8 @@
       return ssl_hs_error;
     }
 
-    UniquePtr<EVP_PKEY_CTX> pctx(EVP_PKEY_CTX_new(ssl->cert->privatekey, NULL));
+    UniquePtr<EVP_PKEY_CTX> pctx(
+        EVP_PKEY_CTX_new(ssl->cert->privatekey.get(), nullptr));
     if (!pctx ||
         !EVP_PKEY_sign_init(pctx.get()) ||
         !EVP_PKEY_sign(pctx.get(), ptr, &sig_len, digest, digest_len)) {
diff --git a/src/ssl/handshake_server.cc b/src/ssl/handshake_server.cc
index 7ade8fc..84004de 100644
--- a/src/ssl/handshake_server.cc
+++ b/src/ssl/handshake_server.cc
@@ -749,8 +749,8 @@
           !CBB_add_u8(&body, TLSEXT_STATUSTYPE_ocsp) ||
           !CBB_add_u24_length_prefixed(&body, &ocsp_response) ||
           !CBB_add_bytes(&ocsp_response,
-                         CRYPTO_BUFFER_data(ssl->cert->ocsp_response),
-                         CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) ||
+                         CRYPTO_BUFFER_data(ssl->cert->ocsp_response.get()),
+                         CRYPTO_BUFFER_len(ssl->cert->ocsp_response.get())) ||
           !ssl_add_message_cbb(ssl, cbb.get())) {
         OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
         return ssl_hs_error;
@@ -902,9 +902,12 @@
         !CBB_add_u8(&cert_types, SSL3_CT_RSA_SIGN) ||
         (ssl_protocol_version(ssl) >= TLS1_VERSION &&
          !CBB_add_u8(&cert_types, TLS_CT_ECDSA_SIGN)) ||
+        // TLS 1.2 has no way to specify different signature algorithms for
+        // certificates and the online signature, so emit the more restrictive
+        // certificate list.
         (ssl_protocol_version(ssl) >= TLS1_2_VERSION &&
          (!CBB_add_u16_length_prefixed(&body, &sigalgs_cbb) ||
-          !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb))) ||
+          !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb, true /* certs */))) ||
         !ssl_add_client_CA_list(ssl, &body) ||
         !ssl_add_message_cbb(ssl, cbb.get())) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index f4dc96f..f1fc63f 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -328,11 +328,11 @@
 
   // CopyFrom replaces the array with a newly-allocated copy of |in|. It returns
   // true on success and false on error.
-  bool CopyFrom(Span<const uint8_t> in) {
+  bool CopyFrom(Span<const T> in) {
     if (!Init(in.size())) {
       return false;
     }
-    OPENSSL_memcpy(data_, in.data(), in.size());
+    OPENSSL_memcpy(data_, in.data(), sizeof(T) * in.size());
     return true;
   }
 
@@ -1739,8 +1739,10 @@
 bool tls1_choose_signature_algorithm(SSL_HANDSHAKE *hs, uint16_t *out);
 
 // tls12_add_verify_sigalgs adds the signature algorithms acceptable for the
-// peer signature to |out|. It returns true on success and false on error.
-bool tls12_add_verify_sigalgs(const SSL *ssl, CBB *out);
+// peer signature to |out|. It returns true on success and false on error. If
+// |for_certs| is true, the potentially more restrictive list of algorithms for
+// certificates is used. Otherwise, the online signature one is used.
+bool tls12_add_verify_sigalgs(const SSL *ssl, CBB *out, bool for_certs);
 
 // tls12_check_peer_sigalg checks if |sigalg| is acceptable for the peer
 // signature. It returns true on success and false on error, setting
@@ -1748,6 +1750,11 @@
 bool tls12_check_peer_sigalg(const SSL *ssl, uint8_t *out_alert,
                              uint16_t sigalg);
 
+// tls12_has_different_verify_sigalgs_for_certs returns whether |ssl| has a
+// different, more restrictive, list of signature algorithms acceptable for the
+// certificate than the online signature.
+bool tls12_has_different_verify_sigalgs_for_certs(const SSL *ssl);
+
 
 // Underdocumented functions.
 //
@@ -1759,7 +1766,12 @@
 #define NAMED_CURVE_TYPE 3
 
 struct CERT {
-  EVP_PKEY *privatekey;
+  static constexpr bool kAllowUniquePtr = true;
+
+  explicit CERT(const SSL_X509_METHOD *x509_method);
+  ~CERT();
+
+  UniquePtr<EVP_PKEY> privatekey;
 
   // chain contains the certificate chain, with the leaf at the beginning. The
   // first element of |chain| may be NULL to indicate that the leaf certificate
@@ -1767,35 +1779,34 @@
   //   If |chain| != NULL -> len(chain) >= 1
   //   If |chain[0]| == NULL -> len(chain) >= 2.
   //   |chain[1..]| != NULL
-  STACK_OF(CRYPTO_BUFFER) *chain;
+  UniquePtr<STACK_OF(CRYPTO_BUFFER)> chain;
 
   // x509_chain may contain a parsed copy of |chain[1..]|. This is only used as
   // a cache in order to implement “get0” functions that return a non-owning
   // pointer to the certificate chain.
-  STACK_OF(X509) *x509_chain;
+  STACK_OF(X509) *x509_chain = nullptr;
 
   // x509_leaf may contain a parsed copy of the first element of |chain|. This
   // is only used as a cache in order to implement “get0” functions that return
   // a non-owning pointer to the certificate chain.
-  X509 *x509_leaf;
+  X509 *x509_leaf = nullptr;
 
   // x509_stash contains the last |X509| object append to the chain. This is a
   // workaround for some third-party code that continue to use an |X509| object
   // even after passing ownership with an “add0” function.
-  X509 *x509_stash;
+  X509 *x509_stash = nullptr;
 
   // key_method, if non-NULL, is a set of callbacks to call for private key
   // operations.
-  const SSL_PRIVATE_KEY_METHOD *key_method;
+  const SSL_PRIVATE_KEY_METHOD *key_method = nullptr;
 
   // x509_method contains pointers to functions that might deal with |X509|
   // compatibility, or might be a no-op, depending on the application.
-  const SSL_X509_METHOD *x509_method;
+  const SSL_X509_METHOD *x509_method = nullptr;
 
-  // sigalgs, if non-NULL, is the set of signature algorithms supported by
+  // sigalgs, if non-empty, is the set of signature algorithms supported by
   // |privatekey| in decreasing order of preference.
-  uint16_t *sigalgs;
-  size_t num_sigalgs;
+  Array<uint16_t> sigalgs;
 
   // Certificate setup callback: if set is called whenever a
   // certificate may be required (client or server). the callback
@@ -1803,23 +1814,23 @@
   // certificates required. This allows advanced applications
   // to select certificates on the fly: for example based on
   // supported signature algorithms or curves.
-  int (*cert_cb)(SSL *ssl, void *arg);
-  void *cert_cb_arg;
+  int (*cert_cb)(SSL *ssl, void *arg) = nullptr;
+  void *cert_cb_arg = nullptr;
 
   // Optional X509_STORE for certificate validation. If NULL the parent SSL_CTX
   // store is used instead.
-  X509_STORE *verify_store;
+  X509_STORE *verify_store = nullptr;
 
   // Signed certificate timestamp list to be sent to the client, if requested
-  CRYPTO_BUFFER *signed_cert_timestamp_list;
+  UniquePtr<CRYPTO_BUFFER> signed_cert_timestamp_list;
 
   // OCSP response to be sent to the client, if requested.
-  CRYPTO_BUFFER *ocsp_response;
+  UniquePtr<CRYPTO_BUFFER> ocsp_response;
 
   // sid_ctx partitions the session space within a shared session cache or
   // ticket key. Only sessions with a matching value will be accepted.
-  uint8_t sid_ctx_length;
-  uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
+  uint8_t sid_ctx_length = 0;
+  uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH] = {0};
 
   // If enable_early_data is true, early data can be sent and accepted.
   bool enable_early_data:1;
@@ -2027,8 +2038,6 @@
   void (*remove_session_cb)(SSL_CTX *ctx, SSL_SESSION *sess);
   SSL_SESSION *(*get_session_cb)(SSL *ssl, const uint8_t *data, int len,
                                  int *copy);
-  SSL_SESSION *(*get_session_cb_legacy)(SSL *ssl, uint8_t *data, int len,
-                                        int *copy);
 
   CRYPTO_refcount_t references;
 
@@ -2229,6 +2238,10 @@
   // ed25519_enabled is whether Ed25519 is advertised in the handshake.
   bool ed25519_enabled:1;
 
+  // rsa_pss_rsae_certs_enabled is whether rsa_pss_rsae_* are supported by the
+  // certificate verifier.
+  bool rsa_pss_rsae_certs_enabled:1;
+
   // false_start_allowed_without_alpn is whether False Start (if
   // |SSL_MODE_ENABLE_FALSE_START| is enabled) is allowed without ALPN.
   bool false_start_allowed_without_alpn:1;
@@ -2440,6 +2453,10 @@
 
   // Contains the QUIC transport params received by the peer.
   Array<uint8_t> peer_quic_transport_params;
+
+  // srtp_profile is the selected SRTP protection profile for
+  // DTLS-SRTP.
+  const SRTP_PROTECTION_PROFILE *srtp_profile = nullptr;
 };
 
 // lengths of messages
@@ -2671,10 +2688,6 @@
   // DTLS-SRTP.
   STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;
 
-  // srtp_profile is the selected SRTP protection profile for
-  // DTLS-SRTP.
-  const SRTP_PROTECTION_PROFILE *srtp_profile;
-
   // The client's Channel ID private key.
   EVP_PKEY *tlsext_channel_id_private;
 
@@ -2753,10 +2766,8 @@
 // kMaxEarlyDataSkipped in tls_record.c, which is measured in ciphertext.
 static const size_t kMaxEarlyDataAccepted = 14336;
 
-CERT *ssl_cert_new(const SSL_X509_METHOD *x509_method);
-CERT *ssl_cert_dup(CERT *cert);
+UniquePtr<CERT> ssl_cert_dup(CERT *cert);
 void ssl_cert_clear_certs(CERT *cert);
-void ssl_cert_free(CERT *cert);
 int ssl_set_cert(CERT *cert, UniquePtr<CRYPTO_BUFFER> buffer);
 int ssl_is_key_type_supported(int key_type);
 // ssl_compare_public_and_private_key returns one if |pubkey| is the public
diff --git a/src/ssl/ssl_cert.cc b/src/ssl/ssl_cert.cc
index 9a3eef3..20b4514 100644
--- a/src/ssl/ssl_cert.cc
+++ b/src/ssl/ssl_cert.cc
@@ -135,16 +135,12 @@
 
 namespace bssl {
 
-CERT *ssl_cert_new(const SSL_X509_METHOD *x509_method) {
-  CERT *ret = (CERT *)OPENSSL_malloc(sizeof(CERT));
-  if (ret == NULL) {
-    OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
-    return NULL;
-  }
-  OPENSSL_memset(ret, 0, sizeof(CERT));
-  ret->x509_method = x509_method;
+CERT::CERT(const SSL_X509_METHOD *x509_method_arg)
+    : x509_method(x509_method_arg), enable_early_data(false) {}
 
-  return ret;
+CERT::~CERT() {
+  ssl_cert_clear_certs(this);
+  x509_method->cert_free(this);
 }
 
 static CRYPTO_BUFFER *buffer_up_ref(CRYPTO_BUFFER *buffer) {
@@ -152,47 +148,45 @@
   return buffer;
 }
 
-CERT *ssl_cert_dup(CERT *cert) {
-  CERT *ret = (CERT *)OPENSSL_malloc(sizeof(CERT));
-  if (ret == NULL) {
-    OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
-    return NULL;
+UniquePtr<CERT> ssl_cert_dup(CERT *cert) {
+  UniquePtr<CERT> ret = MakeUnique<CERT>(cert->x509_method);
+  if (!ret) {
+    return nullptr;
   }
-  OPENSSL_memset(ret, 0, sizeof(CERT));
 
-  ret->chain = sk_CRYPTO_BUFFER_deep_copy(cert->chain, buffer_up_ref,
-                                          CRYPTO_BUFFER_free);
+  if (cert->chain) {
+    ret->chain.reset(sk_CRYPTO_BUFFER_deep_copy(
+        cert->chain.get(), buffer_up_ref, CRYPTO_BUFFER_free));
+    if (!ret->chain) {
+      return nullptr;
+    }
+  }
 
-  if (cert->privatekey != NULL) {
-    EVP_PKEY_up_ref(cert->privatekey);
-    ret->privatekey = cert->privatekey;
+  if (cert->privatekey) {
+    EVP_PKEY_up_ref(cert->privatekey.get());
+    ret->privatekey.reset(cert->privatekey.get());
   }
 
   ret->key_method = cert->key_method;
-  ret->x509_method = cert->x509_method;
 
-  if (cert->sigalgs != NULL) {
-    ret->sigalgs = (uint16_t *)BUF_memdup(
-        cert->sigalgs, cert->num_sigalgs * sizeof(cert->sigalgs[0]));
-    if (ret->sigalgs == NULL) {
-      goto err;
-    }
+  if (!ret->sigalgs.CopyFrom(cert->sigalgs)) {
+    return nullptr;
   }
-  ret->num_sigalgs = cert->num_sigalgs;
 
   ret->cert_cb = cert->cert_cb;
   ret->cert_cb_arg = cert->cert_cb_arg;
 
-  ret->x509_method->cert_dup(ret, cert);
+  ret->x509_method->cert_dup(ret.get(), cert);
 
-  if (cert->signed_cert_timestamp_list != NULL) {
-    CRYPTO_BUFFER_up_ref(cert->signed_cert_timestamp_list);
-    ret->signed_cert_timestamp_list = cert->signed_cert_timestamp_list;
+  if (cert->signed_cert_timestamp_list) {
+    CRYPTO_BUFFER_up_ref(cert->signed_cert_timestamp_list.get());
+    ret->signed_cert_timestamp_list.reset(
+        cert->signed_cert_timestamp_list.get());
   }
 
-  if (cert->ocsp_response != NULL) {
-    CRYPTO_BUFFER_up_ref(cert->ocsp_response);
-    ret->ocsp_response = cert->ocsp_response;
+  if (cert->ocsp_response) {
+    CRYPTO_BUFFER_up_ref(cert->ocsp_response.get());
+    ret->ocsp_response.reset(cert->ocsp_response.get());
   }
 
   ret->sid_ctx_length = cert->sid_ctx_length;
@@ -201,10 +195,6 @@
   ret->enable_early_data = cert->enable_early_data;
 
   return ret;
-
-err:
-  ssl_cert_free(ret);
-  return NULL;
 }
 
 // Free up and clear all certificates and chains
@@ -215,25 +205,9 @@
 
   cert->x509_method->cert_clear(cert);
 
-  sk_CRYPTO_BUFFER_pop_free(cert->chain, CRYPTO_BUFFER_free);
-  cert->chain = NULL;
-  EVP_PKEY_free(cert->privatekey);
-  cert->privatekey = NULL;
-  cert->key_method = NULL;
-}
-
-void ssl_cert_free(CERT *cert) {
-  if (cert == NULL) {
-    return;
-  }
-
-  ssl_cert_clear_certs(cert);
-  cert->x509_method->cert_free(cert);
-  OPENSSL_free(cert->sigalgs);
-  CRYPTO_BUFFER_free(cert->signed_cert_timestamp_list);
-  CRYPTO_BUFFER_free(cert->ocsp_response);
-
-  OPENSSL_free(cert);
+  cert->chain.reset();
+  cert->privatekey.reset();
+  cert->key_method = nullptr;
 }
 
 static void ssl_cert_set_cert_cb(CERT *cert, int (*cb)(SSL *ssl, void *arg),
@@ -311,42 +285,37 @@
       break;
   }
 
-  STACK_OF(CRYPTO_BUFFER) *certs_sk = sk_CRYPTO_BUFFER_new_null();
-  if (certs_sk == NULL) {
+  UniquePtr<STACK_OF(CRYPTO_BUFFER)> certs_sk(sk_CRYPTO_BUFFER_new_null());
+  if (!certs_sk) {
     return 0;
   }
 
   for (size_t i = 0; i < num_certs; i++) {
-    if (!sk_CRYPTO_BUFFER_push(certs_sk, certs[i])) {
-      sk_CRYPTO_BUFFER_pop_free(certs_sk, CRYPTO_BUFFER_free);
+    if (!sk_CRYPTO_BUFFER_push(certs_sk.get(), certs[i])) {
       return 0;
     }
     CRYPTO_BUFFER_up_ref(certs[i]);
   }
 
-  EVP_PKEY_free(cert->privatekey);
-  cert->privatekey = privkey;
-  if (privkey != NULL) {
+  if (privkey != nullptr) {
     EVP_PKEY_up_ref(privkey);
   }
+  cert->privatekey.reset(privkey);
   cert->key_method = privkey_method;
 
-  sk_CRYPTO_BUFFER_pop_free(cert->chain, CRYPTO_BUFFER_free);
-  cert->chain = certs_sk;
-
+  cert->chain = std::move(certs_sk);
   return 1;
 }
 
 int ssl_set_cert(CERT *cert, UniquePtr<CRYPTO_BUFFER> buffer) {
-  switch (check_leaf_cert_and_privkey(buffer.get(), cert->privatekey)) {
+  switch (check_leaf_cert_and_privkey(buffer.get(), cert->privatekey.get())) {
     case leaf_cert_and_privkey_error:
       return 0;
     case leaf_cert_and_privkey_mismatch:
       // don't fail for a cert/key mismatch, just free current private key
       // (when switching to a different cert & key, first this function should
       // be used, then |ssl_set_pkey|.
-      EVP_PKEY_free(cert->privatekey);
-      cert->privatekey = NULL;
+      cert->privatekey.reset();
       break;
     case leaf_cert_and_privkey_ok:
       break;
@@ -354,20 +323,19 @@
 
   cert->x509_method->cert_flush_cached_leaf(cert);
 
-  if (cert->chain != NULL) {
-    CRYPTO_BUFFER_free(sk_CRYPTO_BUFFER_value(cert->chain, 0));
-    sk_CRYPTO_BUFFER_set(cert->chain, 0, buffer.release());
+  if (cert->chain != nullptr) {
+    CRYPTO_BUFFER_free(sk_CRYPTO_BUFFER_value(cert->chain.get(), 0));
+    sk_CRYPTO_BUFFER_set(cert->chain.get(), 0, buffer.release());
     return 1;
   }
 
-  cert->chain = sk_CRYPTO_BUFFER_new_null();
-  if (cert->chain == NULL) {
+  cert->chain.reset(sk_CRYPTO_BUFFER_new_null());
+  if (cert->chain == nullptr) {
     return 0;
   }
 
-  if (!PushToStack(cert->chain, std::move(buffer))) {
-    sk_CRYPTO_BUFFER_free(cert->chain);
-    cert->chain = NULL;
+  if (!PushToStack(cert->chain.get(), std::move(buffer))) {
+    cert->chain.reset();
     return 0;
   }
 
@@ -375,8 +343,8 @@
 }
 
 int ssl_has_certificate(const SSL *ssl) {
-  return ssl->cert->chain != NULL &&
-         sk_CRYPTO_BUFFER_value(ssl->cert->chain, 0) != NULL &&
+  return ssl->cert->chain != nullptr &&
+         sk_CRYPTO_BUFFER_value(ssl->cert->chain.get(), 0) != nullptr &&
          ssl_has_private_key(ssl);
 }
 
@@ -455,7 +423,7 @@
     return 0;
   }
 
-  STACK_OF(CRYPTO_BUFFER) *chain = ssl->cert->chain;
+  STACK_OF(CRYPTO_BUFFER) *chain = ssl->cert->chain.get();
   for (size_t i = 0; i < sk_CRYPTO_BUFFER_num(chain); i++) {
     CRYPTO_BUFFER *buffer = sk_CRYPTO_BUFFER_value(chain, i);
     CBB child;
@@ -558,19 +526,20 @@
 }
 
 int ssl_cert_check_private_key(const CERT *cert, const EVP_PKEY *privkey) {
-  if (privkey == NULL) {
+  if (privkey == nullptr) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_NO_PRIVATE_KEY_ASSIGNED);
     return 0;
   }
 
-  if (cert->chain == NULL ||
-      sk_CRYPTO_BUFFER_value(cert->chain, 0) == NULL) {
+  if (cert->chain == nullptr ||
+      sk_CRYPTO_BUFFER_value(cert->chain.get(), 0) == nullptr) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_NO_CERTIFICATE_ASSIGNED);
     return 0;
   }
 
   CBS cert_cbs;
-  CRYPTO_BUFFER_init_CBS(sk_CRYPTO_BUFFER_value(cert->chain, 0), &cert_cbs);
+  CRYPTO_BUFFER_init_CBS(sk_CRYPTO_BUFFER_value(cert->chain.get(), 0),
+                         &cert_cbs);
   UniquePtr<EVP_PKEY> pubkey = ssl_cert_parse_pubkey(&cert_cbs);
   if (!pubkey) {
     OPENSSL_PUT_ERROR(X509, X509_R_UNKNOWN_KEY_TYPE);
@@ -793,7 +762,8 @@
   }
 
   CBS leaf;
-  CRYPTO_BUFFER_init_CBS(sk_CRYPTO_BUFFER_value(ssl->cert->chain, 0), &leaf);
+  CRYPTO_BUFFER_init_CBS(sk_CRYPTO_BUFFER_value(ssl->cert->chain.get(), 0),
+                         &leaf);
 
   hs->local_pubkey = ssl_cert_parse_pubkey(&leaf);
   return hs->local_pubkey != NULL;
@@ -862,7 +832,7 @@
 }
 
 static int set_signed_cert_timestamp_list(CERT *cert, const uint8_t *list,
-                                           size_t list_len) {
+                                          size_t list_len) {
   CBS sct_list;
   CBS_init(&sct_list, list, list_len);
   if (!ssl_is_sct_list_valid(&sct_list)) {
@@ -870,10 +840,9 @@
     return 0;
   }
 
-  CRYPTO_BUFFER_free(cert->signed_cert_timestamp_list);
-  cert->signed_cert_timestamp_list =
-      CRYPTO_BUFFER_new(CBS_data(&sct_list), CBS_len(&sct_list), NULL);
-  return cert->signed_cert_timestamp_list != NULL;
+  cert->signed_cert_timestamp_list.reset(
+      CRYPTO_BUFFER_new(CBS_data(&sct_list), CBS_len(&sct_list), nullptr));
+  return cert->signed_cert_timestamp_list != nullptr;
 }
 
 int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list,
@@ -888,16 +857,16 @@
 
 int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response,
                               size_t response_len) {
-  CRYPTO_BUFFER_free(ctx->cert->ocsp_response);
-  ctx->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
-  return ctx->cert->ocsp_response != NULL;
+  ctx->cert->ocsp_response.reset(
+      CRYPTO_BUFFER_new(response, response_len, nullptr));
+  return ctx->cert->ocsp_response != nullptr;
 }
 
 int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response,
                           size_t response_len) {
-  CRYPTO_BUFFER_free(ssl->cert->ocsp_response);
-  ssl->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
-  return ssl->cert->ocsp_response != NULL;
+  ssl->cert->ocsp_response.reset(
+      CRYPTO_BUFFER_new(response, response_len, nullptr));
+  return ssl->cert->ocsp_response != nullptr;
 }
 
 void SSL_CTX_set0_client_CAs(SSL_CTX *ctx, STACK_OF(CRYPTO_BUFFER) *name_list) {
diff --git a/src/ssl/ssl_lib.cc b/src/ssl/ssl_lib.cc
index 14fb0ff..8f1de02 100644
--- a/src/ssl/ssl_lib.cc
+++ b/src/ssl/ssl_lib.cc
@@ -549,7 +549,7 @@
 
   ret->max_cert_list = SSL_MAX_CERT_LIST_DEFAULT;
   ret->verify_mode = SSL_VERIFY_NONE;
-  ret->cert = ssl_cert_new(method->x509_method);
+  ret->cert = New<CERT>(method->x509_method);
   if (ret->cert == NULL) {
     goto err;
   }
@@ -580,6 +580,8 @@
   // problems, the feature will be removed entirely.
   ret->mode = SSL_MODE_NO_AUTO_CHAIN;
 
+  ret->rsa_pss_rsae_certs_enabled = true;
+
   // Lock the SSL_CTX to the specified version, for compatibility with legacy
   // uses of SSL_METHOD.
   if (!SSL_CTX_set_max_proto_version(ret, method->version) ||
@@ -621,7 +623,7 @@
   CRYPTO_MUTEX_cleanup(&ctx->lock);
   lh_SSL_SESSION_free(ctx->sessions);
   ssl_cipher_preference_list_free(ctx->cipher_list);
-  ssl_cert_free(ctx->cert);
+  Delete(ctx->cert);
   sk_SSL_CUSTOM_EXTENSION_pop_free(ctx->client_custom_extensions,
                                    SSL_CUSTOM_EXTENSION_free);
   sk_SSL_CUSTOM_EXTENSION_pop_free(ctx->server_custom_extensions,
@@ -668,7 +670,7 @@
   ssl->mode = ctx->mode;
   ssl->max_cert_list = ctx->max_cert_list;
 
-  ssl->cert = ssl_cert_dup(ctx->cert);
+  ssl->cert = ssl_cert_dup(ctx->cert).release();
   if (ssl->cert == NULL) {
     goto err;
   }
@@ -767,7 +769,7 @@
 
   SSL_SESSION_free(ssl->session);
 
-  ssl_cert_free(ssl->cert);
+  Delete(ssl->cert);
 
   OPENSSL_free(ssl->tlsext_hostname);
   SSL_CTX_free(ssl->session_ctx);
@@ -1604,12 +1606,12 @@
 
 // Fix this so it checks all the valid key/cert options
 int SSL_CTX_check_private_key(const SSL_CTX *ctx) {
-  return ssl_cert_check_private_key(ctx->cert, ctx->cert->privatekey);
+  return ssl_cert_check_private_key(ctx->cert, ctx->cert->privatekey.get());
 }
 
 // Fix this function so that it takes an optional type parameter
 int SSL_check_private_key(const SSL *ssl) {
-  return ssl_cert_check_private_key(ssl->cert, ssl->cert->privatekey);
+  return ssl_cert_check_private_key(ssl->cert, ssl->cert->privatekey.get());
 }
 
 long SSL_get_default_timeout(const SSL *ssl) {
@@ -2182,7 +2184,7 @@
 
 EVP_PKEY *SSL_get_privatekey(const SSL *ssl) {
   if (ssl->cert != NULL) {
-    return ssl->cert->privatekey;
+    return ssl->cert->privatekey.get();
   }
 
   return NULL;
@@ -2190,7 +2192,7 @@
 
 EVP_PKEY *SSL_CTX_get0_privatekey(const SSL_CTX *ctx) {
   if (ctx->cert != NULL) {
-    return ctx->cert->privatekey;
+    return ctx->cert->privatekey.get();
   }
 
   return NULL;
@@ -2271,8 +2273,8 @@
     ctx = ssl->session_ctx;
   }
 
-  ssl_cert_free(ssl->cert);
-  ssl->cert = ssl_cert_dup(ctx->cert);
+  Delete(ssl->cert);
+  ssl->cert = ssl_cert_dup(ctx->cert).release();
 
   SSL_CTX_up_ref(ctx);
   SSL_CTX_free(ssl->ctx);
diff --git a/src/ssl/ssl_privkey.cc b/src/ssl/ssl_privkey.cc
index 33cc720..bba03b7 100644
--- a/src/ssl/ssl_privkey.cc
+++ b/src/ssl/ssl_privkey.cc
@@ -82,16 +82,15 @@
     return 0;
   }
 
-  if (cert->chain != NULL &&
-      sk_CRYPTO_BUFFER_value(cert->chain, 0) != NULL &&
+  if (cert->chain != nullptr &&
+      sk_CRYPTO_BUFFER_value(cert->chain.get(), 0) != nullptr &&
       // Sanity-check that the private key and the certificate match.
       !ssl_cert_check_private_key(cert, pkey)) {
     return 0;
   }
 
-  EVP_PKEY_free(cert->privatekey);
   EVP_PKEY_up_ref(pkey);
-  cert->privatekey = pkey;
+  cert->privatekey.reset(pkey);
 
   return 1;
 }
@@ -111,9 +110,9 @@
     {SSL_SIGN_RSA_PKCS1_SHA384, EVP_PKEY_RSA, NID_undef, &EVP_sha384, 0},
     {SSL_SIGN_RSA_PKCS1_SHA512, EVP_PKEY_RSA, NID_undef, &EVP_sha512, 0},
 
-    {SSL_SIGN_RSA_PSS_SHA256, EVP_PKEY_RSA, NID_undef, &EVP_sha256, 1},
-    {SSL_SIGN_RSA_PSS_SHA384, EVP_PKEY_RSA, NID_undef, &EVP_sha384, 1},
-    {SSL_SIGN_RSA_PSS_SHA512, EVP_PKEY_RSA, NID_undef, &EVP_sha512, 1},
+    {SSL_SIGN_RSA_PSS_RSAE_SHA256, EVP_PKEY_RSA, NID_undef, &EVP_sha256, 1},
+    {SSL_SIGN_RSA_PSS_RSAE_SHA384, EVP_PKEY_RSA, NID_undef, &EVP_sha384, 1},
+    {SSL_SIGN_RSA_PSS_RSAE_SHA512, EVP_PKEY_RSA, NID_undef, &EVP_sha512, 1},
 
     {SSL_SIGN_ECDSA_SHA1, EVP_PKEY_EC, NID_undef, &EVP_sha1, 0},
     {SSL_SIGN_ECDSA_SECP256R1_SHA256, EVP_PKEY_EC, NID_X9_62_prime256v1,
@@ -136,7 +135,7 @@
 }
 
 int ssl_has_private_key(const SSL *ssl) {
-  return ssl->cert->privatekey != NULL || ssl->cert->key_method != NULL;
+  return ssl->cert->privatekey != nullptr || ssl->cert->key_method != nullptr;
 }
 
 static int pkey_supports_algorithm(const SSL *ssl, EVP_PKEY *pkey,
@@ -214,7 +213,8 @@
 
   *out_len = max_out;
   ScopedEVP_MD_CTX ctx;
-  if (!setup_ctx(ssl, ctx.get(), ssl->cert->privatekey, sigalg, 0 /* sign */) ||
+  if (!setup_ctx(ssl, ctx.get(), ssl->cert->privatekey.get(), sigalg,
+                 0 /* sign */) ||
       !EVP_DigestSign(ctx.get(), out, out_len, in.data(), in.size())) {
     return ssl_private_key_failure;
   }
@@ -251,7 +251,7 @@
     return ret;
   }
 
-  RSA *rsa = EVP_PKEY_get0_RSA(ssl->cert->privatekey);
+  RSA *rsa = EVP_PKEY_get0_RSA(ssl->cert->privatekey.get());
   if (rsa == NULL) {
     // Decrypt operations are only supported for RSA keys.
     OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
@@ -429,12 +429,12 @@
       return include_curve ? "ecdsa_secp384r1_sha384" : "ecdsa_sha384";
     case SSL_SIGN_ECDSA_SECP521R1_SHA512:
       return include_curve ? "ecdsa_secp521r1_sha512" : "ecdsa_sha512";
-    case SSL_SIGN_RSA_PSS_SHA256:
-      return "rsa_pss_sha256";
-    case SSL_SIGN_RSA_PSS_SHA384:
-      return "rsa_pss_sha384";
-    case SSL_SIGN_RSA_PSS_SHA512:
-      return "rsa_pss_sha512";
+    case SSL_SIGN_RSA_PSS_RSAE_SHA256:
+      return "rsa_pss_rsae_sha256";
+    case SSL_SIGN_RSA_PSS_RSAE_SHA384:
+      return "rsa_pss_rsae_sha384";
+    case SSL_SIGN_RSA_PSS_RSAE_SHA512:
+      return "rsa_pss_rsae_sha512";
     case SSL_SIGN_ED25519:
       return "ed25519";
     default:
@@ -477,14 +477,12 @@
 
 int SSL_CTX_set_signing_algorithm_prefs(SSL_CTX *ctx, const uint16_t *prefs,
                                         size_t num_prefs) {
-  return set_algorithm_prefs(&ctx->cert->sigalgs, &ctx->cert->num_sigalgs,
-                             prefs, num_prefs);
+  return ctx->cert->sigalgs.CopyFrom(MakeConstSpan(prefs, num_prefs));
 }
 
 int SSL_set_signing_algorithm_prefs(SSL *ssl, const uint16_t *prefs,
                                     size_t num_prefs) {
-  return set_algorithm_prefs(&ssl->cert->sigalgs, &ssl->cert->num_sigalgs,
-                             prefs, num_prefs);
+  return ssl->cert->sigalgs.CopyFrom(MakeConstSpan(prefs, num_prefs));
 }
 
 int SSL_CTX_set_verify_algorithm_prefs(SSL_CTX *ctx, const uint16_t *prefs,
diff --git a/src/ssl/ssl_session.cc b/src/ssl/ssl_session.cc
index bc2c14c..a18ddd1 100644
--- a/src/ssl/ssl_session.cc
+++ b/src/ssl/ssl_session.cc
@@ -682,17 +682,10 @@
   }
 
   // Fall back to the external cache, if it exists.
-  if (!session && (ssl->session_ctx->get_session_cb != nullptr ||
-                   ssl->session_ctx->get_session_cb_legacy != nullptr)) {
+  if (!session && ssl->session_ctx->get_session_cb != nullptr) {
     int copy = 1;
-    if (ssl->session_ctx->get_session_cb != nullptr) {
-      session.reset(ssl->session_ctx->get_session_cb(ssl, session_id,
-                                                     session_id_len, &copy));
-    } else {
-      session.reset(ssl->session_ctx->get_session_cb_legacy(
-          ssl, const_cast<uint8_t *>(session_id), session_id_len, &copy));
-    }
-
+    session.reset(ssl->session_ctx->get_session_cb(ssl, session_id,
+                                                   session_id_len, &copy));
     if (!session) {
       return ssl_hs_ok;
     }
@@ -1192,12 +1185,6 @@
   ctx->get_session_cb = cb;
 }
 
-void SSL_CTX_sess_set_get_cb(SSL_CTX *ctx,
-                             SSL_SESSION *(*cb)(SSL *ssl, uint8_t *id,
-                                                int id_len, int *out_copy)) {
-  ctx->get_session_cb_legacy = cb;
-}
-
 SSL_SESSION *(*SSL_CTX_sess_get_get_cb(SSL_CTX *ctx))(SSL *ssl,
                                                       const uint8_t *id,
                                                       int id_len,
diff --git a/src/ssl/ssl_test.cc b/src/ssl/ssl_test.cc
index 12f044c..5fb4fb4 100644
--- a/src/ssl/ssl_test.cc
+++ b/src/ssl/ssl_test.cc
@@ -3447,9 +3447,11 @@
   }
 }
 
+using TicketAEADMethodParam =
+    testing::tuple<uint16_t, unsigned, ssl_test_ticket_aead_failure_mode>;
+
 class TicketAEADMethodTest
-    : public ::testing::TestWithParam<testing::tuple<
-          uint16_t, unsigned, ssl_test_ticket_aead_failure_mode>> {};
+    : public ::testing::TestWithParam<TicketAEADMethodParam> {};
 
 TEST_P(TicketAEADMethodTest, Resume) {
   bssl::UniquePtr<X509> cert = GetTestCertificate();
@@ -3525,15 +3527,49 @@
   }
 }
 
+std::string TicketAEADMethodParamToString(
+    const testing::TestParamInfo<TicketAEADMethodParam> &params) {
+  std::string ret = GetVersionName(std::get<0>(params.param));
+  // GTest only allows alphanumeric characters and '_' in the parameter
+  // string. Additionally filter out the 'v' to get "TLS13" over "TLSv13".
+  for (auto it = ret.begin(); it != ret.end();) {
+    if (*it == '.' || *it == 'v') {
+      it = ret.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  char retry_count[256];
+  snprintf(retry_count, sizeof(retry_count), "%d", std::get<1>(params.param));
+  ret += "_";
+  ret += retry_count;
+  ret += "Retries_";
+  switch (std::get<2>(params.param)) {
+    case ssl_test_ticket_aead_ok:
+      ret += "OK";
+      break;
+    case ssl_test_ticket_aead_seal_fail:
+      ret += "SealFail";
+      break;
+    case ssl_test_ticket_aead_open_soft_fail:
+      ret += "OpenSoftFail";
+      break;
+    case ssl_test_ticket_aead_open_hard_fail:
+      ret += "OpenHardFail";
+      break;
+  }
+  return ret;
+}
+
 INSTANTIATE_TEST_CASE_P(
     TicketAEADMethodTests, TicketAEADMethodTest,
-    testing::Combine(
-        testing::Values(TLS1_2_VERSION, TLS1_3_VERSION),
-        testing::Values(0, 1, 2),
-        testing::Values(ssl_test_ticket_aead_ok,
-                        ssl_test_ticket_aead_seal_fail,
-                        ssl_test_ticket_aead_open_soft_fail,
-                        ssl_test_ticket_aead_open_hard_fail)));
+    testing::Combine(testing::Values(TLS1_2_VERSION, TLS1_3_VERSION),
+                     testing::Values(0, 1, 2),
+                     testing::Values(ssl_test_ticket_aead_ok,
+                                     ssl_test_ticket_aead_seal_fail,
+                                     ssl_test_ticket_aead_open_soft_fail,
+                                     ssl_test_ticket_aead_open_hard_fail)),
+    TicketAEADMethodParamToString);
 
 TEST(SSLTest, SelectNextProto) {
   uint8_t *result;
@@ -3884,10 +3920,10 @@
       SSL_is_signature_algorithm_rsa_pss(SSL_SIGN_ECDSA_SECP256R1_SHA256));
 
   EXPECT_EQ(EVP_PKEY_RSA,
-            SSL_get_signature_algorithm_key_type(SSL_SIGN_RSA_PSS_SHA384));
+            SSL_get_signature_algorithm_key_type(SSL_SIGN_RSA_PSS_RSAE_SHA384));
   EXPECT_EQ(EVP_sha384(),
-            SSL_get_signature_algorithm_digest(SSL_SIGN_RSA_PSS_SHA384));
-  EXPECT_TRUE(SSL_is_signature_algorithm_rsa_pss(SSL_SIGN_RSA_PSS_SHA384));
+            SSL_get_signature_algorithm_digest(SSL_SIGN_RSA_PSS_RSAE_SHA384));
+  EXPECT_TRUE(SSL_is_signature_algorithm_rsa_pss(SSL_SIGN_RSA_PSS_RSAE_SHA384));
 }
 
 void MoveBIOs(SSL *dest, SSL *src) {
diff --git a/src/ssl/ssl_x509.cc b/src/ssl/ssl_x509.cc
index cc27a60..cb35339 100644
--- a/src/ssl/ssl_x509.cc
+++ b/src/ssl/ssl_x509.cc
@@ -186,15 +186,11 @@
 }
 
 // new_leafless_chain returns a fresh stack of buffers set to {NULL}.
-static STACK_OF(CRYPTO_BUFFER) *new_leafless_chain(void) {
-  STACK_OF(CRYPTO_BUFFER) *chain = sk_CRYPTO_BUFFER_new_null();
-  if (chain == NULL) {
-    return NULL;
-  }
-
-  if (!sk_CRYPTO_BUFFER_push(chain, NULL)) {
-    sk_CRYPTO_BUFFER_free(chain);
-    return NULL;
+static UniquePtr<STACK_OF(CRYPTO_BUFFER)> new_leafless_chain(void) {
+  UniquePtr<STACK_OF(CRYPTO_BUFFER)> chain(sk_CRYPTO_BUFFER_new_null());
+  if (!chain ||
+      !sk_CRYPTO_BUFFER_push(chain.get(), nullptr)) {
+    return nullptr;
   }
 
   return chain;
@@ -207,25 +203,25 @@
 static int ssl_cert_set_chain(CERT *cert, STACK_OF(X509) *chain) {
   UniquePtr<STACK_OF(CRYPTO_BUFFER)> new_chain;
 
-  if (cert->chain != NULL) {
+  if (cert->chain != nullptr) {
     new_chain.reset(sk_CRYPTO_BUFFER_new_null());
     if (!new_chain) {
       return 0;
     }
 
-    CRYPTO_BUFFER *leaf = sk_CRYPTO_BUFFER_value(cert->chain, 0);
+    CRYPTO_BUFFER *leaf = sk_CRYPTO_BUFFER_value(cert->chain.get(), 0);
     if (!sk_CRYPTO_BUFFER_push(new_chain.get(), leaf)) {
       return 0;
     }
     // |leaf| might be NULL if it's a “leafless” chain.
-    if (leaf != NULL) {
+    if (leaf != nullptr) {
       CRYPTO_BUFFER_up_ref(leaf);
     }
   }
 
   for (X509 *x509 : chain) {
     if (!new_chain) {
-      new_chain.reset(new_leafless_chain());
+      new_chain = new_leafless_chain();
       if (!new_chain) {
         return 0;
       }
@@ -238,9 +234,7 @@
     }
   }
 
-  sk_CRYPTO_BUFFER_pop_free(cert->chain, CRYPTO_BUFFER_free);
-  cert->chain = new_chain.release();
-
+  cert->chain = std::move(new_chain);
   return 1;
 }
 
@@ -441,13 +435,13 @@
   // isn't disabled.
   if ((ssl->mode & SSL_MODE_NO_AUTO_CHAIN) ||
       !ssl_has_certificate(ssl) ||
-      ssl->cert->chain == NULL ||
-      sk_CRYPTO_BUFFER_num(ssl->cert->chain) > 1) {
+      ssl->cert->chain == nullptr ||
+      sk_CRYPTO_BUFFER_num(ssl->cert->chain.get()) > 1) {
     return 1;
   }
 
-  UniquePtr<X509> leaf(
-      X509_parse_from_buffer(sk_CRYPTO_BUFFER_value(ssl->cert->chain, 0)));
+  UniquePtr<X509> leaf(X509_parse_from_buffer(
+      sk_CRYPTO_BUFFER_value(ssl->cert->chain.get(), 0)));
   if (!leaf) {
     OPENSSL_PUT_ERROR(SSL, ERR_R_X509_LIB);
     return 0;
@@ -750,7 +744,7 @@
     return 1;
   }
 
-  CRYPTO_BUFFER *leaf = sk_CRYPTO_BUFFER_value(cert->chain, 0);
+  CRYPTO_BUFFER *leaf = sk_CRYPTO_BUFFER_value(cert->chain.get(), 0);
   if (!leaf) {
     return 1;
   }
@@ -807,14 +801,13 @@
   }
 
   if (cert->chain != NULL) {
-    return PushToStack(cert->chain, std::move(buffer));
+    return PushToStack(cert->chain.get(), std::move(buffer));
   }
 
   cert->chain = new_leafless_chain();
-  if (cert->chain == NULL ||
-      !PushToStack(cert->chain, std::move(buffer))) {
-    sk_CRYPTO_BUFFER_free(cert->chain);
-    cert->chain = NULL;
+  if (!cert->chain ||
+      !PushToStack(cert->chain.get(), std::move(buffer))) {
+    cert->chain.reset();
     return 0;
   }
 
@@ -906,9 +899,9 @@
 static int ssl_cert_cache_chain_certs(CERT *cert) {
   assert(cert->x509_method);
 
-  if (cert->x509_chain != NULL ||
-      cert->chain == NULL ||
-      sk_CRYPTO_BUFFER_num(cert->chain) < 2) {
+  if (cert->x509_chain != nullptr ||
+      cert->chain == nullptr ||
+      sk_CRYPTO_BUFFER_num(cert->chain.get()) < 2) {
     return 1;
   }
 
@@ -917,8 +910,8 @@
     return 0;
   }
 
-  for (size_t i = 1; i < sk_CRYPTO_BUFFER_num(cert->chain); i++) {
-    CRYPTO_BUFFER *buffer = sk_CRYPTO_BUFFER_value(cert->chain, i);
+  for (size_t i = 1; i < sk_CRYPTO_BUFFER_num(cert->chain.get()); i++) {
+    CRYPTO_BUFFER *buffer = sk_CRYPTO_BUFFER_value(cert->chain.get(), i);
     UniquePtr<X509> x509(X509_parse_from_buffer(buffer));
     if (!x509 ||
         !PushToStack(chain.get(), std::move(x509))) {
diff --git a/src/ssl/t1_lib.cc b/src/ssl/t1_lib.cc
index 2d3a664..a821246 100644
--- a/src/ssl/t1_lib.cc
+++ b/src/ssl/t1_lib.cc
@@ -419,15 +419,15 @@
     // List our preferred algorithms first.
     SSL_SIGN_ED25519,
     SSL_SIGN_ECDSA_SECP256R1_SHA256,
-    SSL_SIGN_RSA_PSS_SHA256,
+    SSL_SIGN_RSA_PSS_RSAE_SHA256,
     SSL_SIGN_RSA_PKCS1_SHA256,
 
     // Larger hashes are acceptable.
     SSL_SIGN_ECDSA_SECP384R1_SHA384,
-    SSL_SIGN_RSA_PSS_SHA384,
+    SSL_SIGN_RSA_PSS_RSAE_SHA384,
     SSL_SIGN_RSA_PKCS1_SHA384,
 
-    SSL_SIGN_RSA_PSS_SHA512,
+    SSL_SIGN_RSA_PSS_RSAE_SHA512,
     SSL_SIGN_RSA_PKCS1_SHA512,
 
     // For now, SHA-1 is still accepted but least preferable.
@@ -445,18 +445,18 @@
     // List our preferred algorithms first.
     SSL_SIGN_ED25519,
     SSL_SIGN_ECDSA_SECP256R1_SHA256,
-    SSL_SIGN_RSA_PSS_SHA256,
+    SSL_SIGN_RSA_PSS_RSAE_SHA256,
     SSL_SIGN_RSA_PKCS1_SHA256,
 
     // If needed, sign larger hashes.
     //
     // TODO(davidben): Determine which of these may be pruned.
     SSL_SIGN_ECDSA_SECP384R1_SHA384,
-    SSL_SIGN_RSA_PSS_SHA384,
+    SSL_SIGN_RSA_PSS_RSAE_SHA384,
     SSL_SIGN_RSA_PKCS1_SHA384,
 
     SSL_SIGN_ECDSA_SECP521R1_SHA512,
-    SSL_SIGN_RSA_PSS_SHA512,
+    SSL_SIGN_RSA_PSS_RSAE_SHA512,
     SSL_SIGN_RSA_PKCS1_SHA512,
 
     // If the peer supports nothing else, sign with SHA-1.
@@ -464,44 +464,78 @@
     SSL_SIGN_RSA_PKCS1_SHA1,
 };
 
-bool tls12_add_verify_sigalgs(const SSL *ssl, CBB *out) {
-  bool use_default = ssl->ctx->num_verify_sigalgs == 0;
-  Span<const uint16_t> sigalgs = kVerifySignatureAlgorithms;
-  if (!use_default) {
-    sigalgs = MakeConstSpan(ssl->ctx->verify_sigalgs,
-                            ssl->ctx->num_verify_sigalgs);
+struct SSLSignatureAlgorithmList {
+  bool Next(uint16_t *out) {
+    while (!list.empty()) {
+      uint16_t sigalg = list[0];
+      list = list.subspan(1);
+      if (skip_ed25519 && sigalg == SSL_SIGN_ED25519) {
+        continue;
+      }
+      if (skip_rsa_pss_rsae && SSL_is_signature_algorithm_rsa_pss(sigalg)) {
+        continue;
+      }
+      *out = sigalg;
+      return true;
+    }
+    return false;
   }
 
-  for (uint16_t sigalg : sigalgs) {
-    if (use_default &&
-        sigalg == SSL_SIGN_ED25519 &&
-        !ssl->ctx->ed25519_enabled) {
-      continue;
+  bool operator==(const SSLSignatureAlgorithmList &other) const {
+    SSLSignatureAlgorithmList a = *this;
+    SSLSignatureAlgorithmList b = other;
+    uint16_t a_val, b_val;
+    while (a.Next(&a_val)) {
+      if (!b.Next(&b_val) ||
+          a_val != b_val) {
+        return false;
+      }
     }
+    return !b.Next(&b_val);
+  }
+
+  bool operator!=(const SSLSignatureAlgorithmList &other) const {
+    return !(*this == other);
+  }
+
+  Span<const uint16_t> list;
+  bool skip_ed25519 = false;
+  bool skip_rsa_pss_rsae = false;
+};
+
+static SSLSignatureAlgorithmList tls12_get_verify_sigalgs(const SSL *ssl,
+                                                          bool for_certs) {
+  SSLSignatureAlgorithmList ret;
+  if (ssl->ctx->num_verify_sigalgs != 0) {
+    ret.list =
+        MakeConstSpan(ssl->ctx->verify_sigalgs, ssl->ctx->num_verify_sigalgs);
+  } else {
+    ret.list = kVerifySignatureAlgorithms;
+    ret.skip_ed25519 = !ssl->ctx->ed25519_enabled;
+  }
+  if (for_certs) {
+    ret.skip_rsa_pss_rsae = !ssl->ctx->rsa_pss_rsae_certs_enabled;
+  }
+  return ret;
+}
+
+bool tls12_add_verify_sigalgs(const SSL *ssl, CBB *out, bool for_certs) {
+  SSLSignatureAlgorithmList list = tls12_get_verify_sigalgs(ssl, for_certs);
+  uint16_t sigalg;
+  while (list.Next(&sigalg)) {
     if (!CBB_add_u16(out, sigalg)) {
       return false;
     }
   }
-
   return true;
 }
 
 bool tls12_check_peer_sigalg(const SSL *ssl, uint8_t *out_alert,
                              uint16_t sigalg) {
-  const uint16_t *sigalgs = kVerifySignatureAlgorithms;
-  size_t num_sigalgs = OPENSSL_ARRAY_SIZE(kVerifySignatureAlgorithms);
-  if (ssl->ctx->num_verify_sigalgs != 0) {
-    sigalgs = ssl->ctx->verify_sigalgs;
-    num_sigalgs = ssl->ctx->num_verify_sigalgs;
-  }
-
-  for (size_t i = 0; i < num_sigalgs; i++) {
-    if (sigalgs == kVerifySignatureAlgorithms &&
-        sigalgs[i] == SSL_SIGN_ED25519 &&
-        !ssl->ctx->ed25519_enabled) {
-      continue;
-    }
-    if (sigalg == sigalgs[i]) {
+  SSLSignatureAlgorithmList list = tls12_get_verify_sigalgs(ssl, false);
+  uint16_t verify_sigalg;
+  while (list.Next(&verify_sigalg)) {
+    if (verify_sigalg == sigalg) {
       return true;
     }
   }
@@ -511,6 +545,11 @@
   return false;
 }
 
+bool tls12_has_different_verify_sigalgs_for_certs(const SSL *ssl) {
+  return tls12_get_verify_sigalgs(ssl, true) !=
+         tls12_get_verify_sigalgs(ssl, false);
+}
+
 // tls_extension represents a TLS extension that is handled internally. The
 // |init| function is called for each handshake, before any other functions of
 // the extension. Then the add and parse callbacks are called as needed.
@@ -985,11 +1024,23 @@
     return true;
   }
 
+  // Prior to TLS 1.3, there was no way to signal different signature algorithm
+  // preferences between the online signature and certificates. If we do not
+  // send the signature_algorithms_cert extension, use the potentially more
+  // restrictive certificate list.
+  //
+  // TODO(davidben): When TLS 1.3 is finalized, we can likely remove the TLS 1.3
+  // check both here and in signature_algorithms_cert. |hs->max_version| is not
+  // the negotiated version. Rather the expectation is that any server consuming
+  // signature algorithms added in TLS 1.3 will also know to look at
+  // signature_algorithms_cert. For now, TLS 1.3 is not quite yet final and it
+  // seems prudent to condition this new extension on it.
+  bool for_certs = hs->max_version < TLS1_3_VERSION;
   CBB contents, sigalgs_cbb;
   if (!CBB_add_u16(out, TLSEXT_TYPE_signature_algorithms) ||
       !CBB_add_u16_length_prefixed(out, &contents) ||
       !CBB_add_u16_length_prefixed(&contents, &sigalgs_cbb) ||
-      !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb) ||
+      !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb, for_certs) ||
       !CBB_flush(out)) {
     return false;
   }
@@ -1016,6 +1067,35 @@
 }
 
 
+// Signature Algorithms for Certificates.
+//
+// https://tools.ietf.org/html/draft-ietf-tls-tls13-23#section-4.2.3
+
+static bool ext_sigalgs_cert_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
+  SSL *const ssl = hs->ssl;
+  // If this extension is omitted, it defaults to the signature_algorithms
+  // extension, so only emit it if the list is different.
+  //
+  // This extension is also new in TLS 1.3, so omit it if TLS 1.3 is disabled.
+  // There is a corresponding version check in |ext_sigalgs_add_clienthello|.
+  if (hs->max_version < TLS1_3_VERSION ||
+      !tls12_has_different_verify_sigalgs_for_certs(ssl)) {
+    return true;
+  }
+
+  CBB contents, sigalgs_cbb;
+  if (!CBB_add_u16(out, TLSEXT_TYPE_signature_algorithms_cert) ||
+      !CBB_add_u16_length_prefixed(out, &contents) ||
+      !CBB_add_u16_length_prefixed(&contents, &sigalgs_cbb) ||
+      !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb, true /* certs */) ||
+      !CBB_flush(out)) {
+    return false;
+  }
+
+  return true;
+}
+
+
 // OCSP Stapling.
 //
 // https://tools.ietf.org/html/rfc6066#section-8
@@ -1309,8 +1389,8 @@
          CBB_add_u16_length_prefixed(out, &contents) &&
          CBB_add_bytes(
              &contents,
-             CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list),
-             CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) &&
+             CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list.get()),
+             CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list.get())) &&
          CBB_flush(out);
 }
 
@@ -1566,7 +1646,7 @@
 
 
 static void ext_srtp_init(SSL_HANDSHAKE *hs) {
-  hs->ssl->srtp_profile = NULL;
+  hs->ssl->s3->srtp_profile = NULL;
 }
 
 static bool ext_srtp_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
@@ -1633,7 +1713,7 @@
   // offered).
   for (const SRTP_PROTECTION_PROFILE *profile : profiles) {
     if (profile->id == profile_id) {
-      ssl->srtp_profile = profile;
+      ssl->s3->srtp_profile = profile;
       return true;
     }
   }
@@ -1675,7 +1755,7 @@
       }
 
       if (server_profile->id == profile_id) {
-        ssl->srtp_profile = server_profile;
+        ssl->s3->srtp_profile = server_profile;
         return true;
       }
     }
@@ -1686,7 +1766,7 @@
 
 static bool ext_srtp_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
   SSL *const ssl = hs->ssl;
-  if (ssl->srtp_profile == NULL) {
+  if (ssl->s3->srtp_profile == NULL) {
     return true;
   }
 
@@ -1694,7 +1774,7 @@
   if (!CBB_add_u16(out, TLSEXT_TYPE_srtp) ||
       !CBB_add_u16_length_prefixed(out, &contents) ||
       !CBB_add_u16_length_prefixed(&contents, &profile_ids) ||
-      !CBB_add_u16(&profile_ids, ssl->srtp_profile->id) ||
+      !CBB_add_u16(&profile_ids, ssl->s3->srtp_profile->id) ||
       !CBB_add_u8(&contents, 0 /* empty MKI */) ||
       !CBB_flush(out)) {
     return false;
@@ -2726,6 +2806,14 @@
     dont_add_serverhello,
   },
   {
+    TLSEXT_TYPE_signature_algorithms_cert,
+    NULL,
+    ext_sigalgs_cert_add_clienthello,
+    forbid_parse_serverhello,
+    ignore_parse_clienthello,
+    dont_add_serverhello,
+  },
+  {
     TLSEXT_TYPE_status_request,
     NULL,
     ext_ocsp_add_clienthello,
@@ -3500,8 +3588,8 @@
   }
 
   Span<const uint16_t> sigalgs = kSignSignatureAlgorithms;
-  if (cert->sigalgs != nullptr) {
-    sigalgs = MakeConstSpan(cert->sigalgs, cert->num_sigalgs);
+  if (!cert->sigalgs.empty()) {
+    sigalgs = cert->sigalgs;
   }
 
   Span<const uint16_t> peer_sigalgs = hs->peer_sigalgs;
@@ -3776,6 +3864,10 @@
   ctx->ed25519_enabled = !!enabled;
 }
 
+void SSL_CTX_set_rsa_pss_rsae_certs_enabled(SSL_CTX *ctx, int enabled) {
+  ctx->rsa_pss_rsae_certs_enabled = !!enabled;
+}
+
 int SSL_extension_supported(unsigned extension_value) {
   uint32_t index;
   return extension_value == TLSEXT_TYPE_padding ||
diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc
index 107de52..824dc94 100644
--- a/src/ssl/test/bssl_shim.cc
+++ b/src/ssl/test/bssl_shim.cc
@@ -333,40 +333,14 @@
     abort();
   }
 
-  // Determine the hash.
-  const EVP_MD *md;
-  switch (signature_algorithm) {
-    case SSL_SIGN_RSA_PKCS1_SHA1:
-    case SSL_SIGN_ECDSA_SHA1:
-      md = EVP_sha1();
-      break;
-    case SSL_SIGN_RSA_PKCS1_SHA256:
-    case SSL_SIGN_ECDSA_SECP256R1_SHA256:
-    case SSL_SIGN_RSA_PSS_SHA256:
-      md = EVP_sha256();
-      break;
-    case SSL_SIGN_RSA_PKCS1_SHA384:
-    case SSL_SIGN_ECDSA_SECP384R1_SHA384:
-    case SSL_SIGN_RSA_PSS_SHA384:
-      md = EVP_sha384();
-      break;
-    case SSL_SIGN_RSA_PKCS1_SHA512:
-    case SSL_SIGN_ECDSA_SECP521R1_SHA512:
-    case SSL_SIGN_RSA_PSS_SHA512:
-      md = EVP_sha512();
-      break;
-    case SSL_SIGN_RSA_PKCS1_MD5_SHA1:
-      md = EVP_md5_sha1();
-      break;
-    case SSL_SIGN_ED25519:
-      md = nullptr;
-      break;
-    default:
-      fprintf(stderr, "Unknown signature algorithm %04x.\n",
-              signature_algorithm);
-      return ssl_private_key_failure;
+  if (EVP_PKEY_id(test_state->private_key.get()) !=
+      SSL_get_signature_algorithm_key_type(signature_algorithm)) {
+    fprintf(stderr, "Key type does not match signature algorithm.\n");
+    abort();
   }
 
+  // Determine the hash.
+  const EVP_MD *md = SSL_get_signature_algorithm_digest(signature_algorithm);
   bssl::ScopedEVP_MD_CTX ctx;
   EVP_PKEY_CTX *pctx;
   if (!EVP_DigestSignInit(ctx.get(), &pctx, md, nullptr,
@@ -375,15 +349,11 @@
   }
 
   // Configure additional signature parameters.
-  switch (signature_algorithm) {
-    case SSL_SIGN_RSA_PSS_SHA256:
-    case SSL_SIGN_RSA_PSS_SHA384:
-    case SSL_SIGN_RSA_PSS_SHA512:
-      if (!EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) ||
-          !EVP_PKEY_CTX_set_rsa_pss_saltlen(pctx,
-                                            -1 /* salt len = hash len */)) {
-        return ssl_private_key_failure;
-      }
+  if (SSL_is_signature_algorithm_rsa_pss(signature_algorithm)) {
+    if (!EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) ||
+        !EVP_PKEY_CTX_set_rsa_pss_saltlen(pctx, -1 /* salt len = hash len */)) {
+      return ssl_private_key_failure;
+    }
   }
 
   // Write the signature into |test_state|.
@@ -1318,6 +1288,9 @@
   if (config->enable_ed25519) {
     SSL_CTX_set_ed25519_enabled(ssl_ctx.get(), 1);
   }
+  if (config->no_rsa_pss_rsae_certs) {
+    SSL_CTX_set_rsa_pss_rsae_certs_enabled(ssl_ctx.get(), 0);
+  }
 
   if (!config->verify_prefs.empty()) {
     std::vector<uint16_t> u16s(config->verify_prefs.begin(),
diff --git a/src/ssl/test/runner/common.go b/src/ssl/test/runner/common.go
index 8f354e9..7115b4d 100644
--- a/src/ssl/test/runner/common.go
+++ b/src/ssl/test/runner/common.go
@@ -10,6 +10,7 @@
 	"crypto/ecdsa"
 	"crypto/rand"
 	"crypto/x509"
+	"errors"
 	"fmt"
 	"io"
 	"math/big"
@@ -122,13 +123,13 @@
 	extensionTokenBinding               uint16 = 24
 	extensionQUICTransportParams        uint16 = 26
 	extensionSessionTicket              uint16 = 35
-	extensionPreSharedKey               uint16 = 41    // draft-ietf-tls-tls13-16
-	extensionEarlyData                  uint16 = 42    // draft-ietf-tls-tls13-16
-	extensionSupportedVersions          uint16 = 43    // draft-ietf-tls-tls13-16
-	extensionCookie                     uint16 = 44    // draft-ietf-tls-tls13-16
-	extensionPSKKeyExchangeModes        uint16 = 45    // draft-ietf-tls-tls13-18
-	extensionTicketEarlyDataInfo        uint16 = 46    // draft-ietf-tls-tls13-18
-	extensionCertificateAuthorities     uint16 = 47    // draft-ietf-tls-tls13-21
+	extensionPreSharedKey               uint16 = 41    // draft-ietf-tls-tls13-23
+	extensionEarlyData                  uint16 = 42    // draft-ietf-tls-tls13-23
+	extensionSupportedVersions          uint16 = 43    // draft-ietf-tls-tls13-23
+	extensionCookie                     uint16 = 44    // draft-ietf-tls-tls13-23
+	extensionPSKKeyExchangeModes        uint16 = 45    // draft-ietf-tls-tls13-23
+	extensionCertificateAuthorities     uint16 = 47    // draft-ietf-tls-tls13-23
+	extensionSignatureAlgorithmsCert    uint16 = 50    // draft-ietf-tls-tls13-23
 	extensionKeyShare                   uint16 = 51    // draft-ietf-tls-tls13-23
 	extensionCustom                     uint16 = 1234  // not IANA assigned
 	extensionNextProtoNeg               uint16 = 13172 // not IANA assigned
@@ -531,6 +532,15 @@
 	NumRSABadValues
 )
 
+type RSAPSSSupport int
+
+const (
+	RSAPSSSupportAny RSAPSSSupport = iota
+	RSAPSSSupportNone
+	RSAPSSSupportOnlineSignatureOnly
+	RSAPSSSupportBoth
+)
+
 type ProtocolBugs struct {
 	// InvalidSignature specifies that the signature in a ServerKeyExchange
 	// or CertificateVerify message should be invalid.
@@ -1562,6 +1572,14 @@
 	// SendCompressedCoordinates, if true, causes ECDH key shares over NIST
 	// curves to use compressed coordinates.
 	SendCompressedCoordinates bool
+
+	// ExpectRSAPSSSupport specifies the level of RSA-PSS support expected
+	// from the peer.
+	ExpectRSAPSSSupport RSAPSSSupport
+
+	// SetX25519HighBit, if true, causes X25519 key shares to set their
+	// high-order bit.
+	SetX25519HighBit bool
 }
 
 func (c *Config) serverInit() {
@@ -2000,3 +2018,50 @@
 	}
 	return false
 }
+
+func checkRSAPSSSupport(support RSAPSSSupport, sigAlgs, sigAlgsCert []signatureAlgorithm) error {
+	if sigAlgsCert == nil {
+		sigAlgsCert = sigAlgs
+	} else if eqSignatureAlgorithms(sigAlgs, sigAlgsCert) {
+		// The peer should have only sent the list once.
+		return errors.New("tls: signature_algorithms and signature_algorithms_cert extensions were identical")
+	}
+
+	if support == RSAPSSSupportAny {
+		return nil
+	}
+
+	var foundPSS, foundPSSCert bool
+	for _, sigAlg := range sigAlgs {
+		if sigAlg == signatureRSAPSSWithSHA256 || sigAlg == signatureRSAPSSWithSHA384 || sigAlg == signatureRSAPSSWithSHA512 {
+			foundPSS = true
+			break
+		}
+	}
+	for _, sigAlg := range sigAlgsCert {
+		if sigAlg == signatureRSAPSSWithSHA256 || sigAlg == signatureRSAPSSWithSHA384 || sigAlg == signatureRSAPSSWithSHA512 {
+			foundPSSCert = true
+			break
+		}
+	}
+
+	expectPSS := support != RSAPSSSupportNone
+	if foundPSS != expectPSS {
+		if expectPSS {
+			return errors.New("tls: peer did not support PSS")
+		} else {
+			return errors.New("tls: peer unexpectedly supported PSS")
+		}
+	}
+
+	expectPSSCert := support == RSAPSSSupportBoth
+	if foundPSSCert != expectPSSCert {
+		if expectPSSCert {
+			return errors.New("tls: peer did not support PSS in certificates")
+		} else {
+			return errors.New("tls: peer unexpectedly supported PSS in certificates")
+		}
+	}
+
+	return nil
+}
diff --git a/src/ssl/test/runner/handshake_client.go b/src/ssl/test/runner/handshake_client.go
index d74c953..4178dc1 100644
--- a/src/ssl/test/runner/handshake_client.go
+++ b/src/ssl/test/runner/handshake_client.go
@@ -843,6 +843,10 @@
 				return errors.New("tls: expected no certificate_authorities extension")
 			}
 
+			if err := checkRSAPSSSupport(c.config.Bugs.ExpectRSAPSSSupport, certReq.signatureAlgorithms, certReq.signatureAlgorithmsCert); err != nil {
+				return err
+			}
+
 			if c.config.Bugs.IgnorePeerSignatureAlgorithmPreferences {
 				certReq.signatureAlgorithms = c.config.signSignatureAlgorithms()
 			}
@@ -1163,6 +1167,9 @@
 	certReq, ok := msg.(*certificateRequestMsg)
 	if ok {
 		certRequested = true
+		if err := checkRSAPSSSupport(c.config.Bugs.ExpectRSAPSSSupport, certReq.signatureAlgorithms, certReq.signatureAlgorithmsCert); err != nil {
+			return err
+		}
 		if c.config.Bugs.IgnorePeerSignatureAlgorithmPreferences {
 			certReq.signatureAlgorithms = c.config.signSignatureAlgorithms()
 		}
diff --git a/src/ssl/test/runner/handshake_messages.go b/src/ssl/test/runner/handshake_messages.go
index b9fb89d..64936b4 100644
--- a/src/ssl/test/runner/handshake_messages.go
+++ b/src/ssl/test/runner/handshake_messages.go
@@ -275,6 +275,7 @@
 	ticketSupported         bool
 	sessionTicket           []uint8
 	signatureAlgorithms     []signatureAlgorithm
+	signatureAlgorithmsCert []signatureAlgorithm
 	supportedVersions       []uint16
 	secureRenegotiation     []byte
 	alpnProtocols           []string
@@ -327,6 +328,7 @@
 		m.ticketSupported == m1.ticketSupported &&
 		bytes.Equal(m.sessionTicket, m1.sessionTicket) &&
 		eqSignatureAlgorithms(m.signatureAlgorithms, m1.signatureAlgorithms) &&
+		eqSignatureAlgorithms(m.signatureAlgorithmsCert, m1.signatureAlgorithmsCert) &&
 		eqUint16s(m.supportedVersions, m1.supportedVersions) &&
 		bytes.Equal(m.secureRenegotiation, m1.secureRenegotiation) &&
 		(m.secureRenegotiation == nil) == (m1.secureRenegotiation == nil) &&
@@ -495,6 +497,14 @@
 			signatureAlgorithms.addU16(uint16(sigAlg))
 		}
 	}
+	if len(m.signatureAlgorithmsCert) > 0 {
+		extensions.addU16(extensionSignatureAlgorithmsCert)
+		signatureAlgorithmsCertExtension := extensions.addU16LengthPrefixed()
+		signatureAlgorithmsCert := signatureAlgorithmsCertExtension.addU16LengthPrefixed()
+		for _, sigAlg := range m.signatureAlgorithmsCert {
+			signatureAlgorithmsCert.addU16(uint16(sigAlg))
+		}
+	}
 	if len(m.supportedVersions) > 0 {
 		extensions.addU16(extensionSupportedVersions)
 		supportedVersionsExtension := extensions.addU16LengthPrefixed()
@@ -618,11 +628,14 @@
 	return m.raw
 }
 
-func parseSignatureAlgorithms(reader *byteReader, out *[]signatureAlgorithm) bool {
+func parseSignatureAlgorithms(reader *byteReader, out *[]signatureAlgorithm, allowEmpty bool) bool {
 	var sigAlgs byteReader
 	if !reader.readU16LengthPrefixed(&sigAlgs) {
 		return false
 	}
+	if !allowEmpty && len(sigAlgs) == 0 {
+		return false
+	}
 	*out = make([]signatureAlgorithm, 0, len(sigAlgs)/2)
 	for len(sigAlgs) > 0 {
 		var v uint16
@@ -676,6 +689,7 @@
 	m.ticketSupported = false
 	m.sessionTicket = nil
 	m.signatureAlgorithms = nil
+	m.signatureAlgorithmsCert = nil
 	m.supportedVersions = nil
 	m.alpnProtocols = nil
 	m.extendedMasterSecret = false
@@ -806,7 +820,11 @@
 			}
 		case extensionSignatureAlgorithms:
 			// https://tools.ietf.org/html/rfc5246#section-7.4.1.4.1
-			if !parseSignatureAlgorithms(&body, &m.signatureAlgorithms) || len(body) != 0 {
+			if !parseSignatureAlgorithms(&body, &m.signatureAlgorithms, false) || len(body) != 0 {
+				return false
+			}
+		case extensionSignatureAlgorithmsCert:
+			if !parseSignatureAlgorithms(&body, &m.signatureAlgorithmsCert, false) || len(body) != 0 {
 				return false
 			}
 		case extensionSupportedVersions:
@@ -1839,12 +1857,13 @@
 	// TLS 1.3.
 	hasRequestContext bool
 
-	certificateTypes       []byte
-	requestContext         []byte
-	signatureAlgorithms    []signatureAlgorithm
-	certificateAuthorities [][]byte
-	hasCAExtension         bool
-	customExtension        uint16
+	certificateTypes        []byte
+	requestContext          []byte
+	signatureAlgorithms     []signatureAlgorithm
+	signatureAlgorithmsCert []signatureAlgorithm
+	certificateAuthorities  [][]byte
+	hasCAExtension          bool
+	customExtension         uint16
 }
 
 func (m *certificateRequestMsg) marshal() []byte {
@@ -1869,6 +1888,13 @@
 				signatureAlgorithms.addU16(uint16(sigAlg))
 			}
 		}
+		if len(m.signatureAlgorithmsCert) > 0 {
+			extensions.addU16(extensionSignatureAlgorithmsCert)
+			signatureAlgorithmsCert := extensions.addU16LengthPrefixed().addU16LengthPrefixed()
+			for _, sigAlg := range m.signatureAlgorithmsCert {
+				signatureAlgorithmsCert.addU16(uint16(sigAlg))
+			}
+		}
 		if len(m.certificateAuthorities) > 0 {
 			extensions.addU16(extensionCertificateAuthorities)
 			certificateAuthorities := extensions.addU16LengthPrefixed().addU16LengthPrefixed()
@@ -1939,7 +1965,11 @@
 			}
 			switch extension {
 			case extensionSignatureAlgorithms:
-				if !parseSignatureAlgorithms(&body, &m.signatureAlgorithms) || len(body) != 0 {
+				if !parseSignatureAlgorithms(&body, &m.signatureAlgorithms, false) || len(body) != 0 {
+					return false
+				}
+			case extensionSignatureAlgorithmsCert:
+				if !parseSignatureAlgorithms(&body, &m.signatureAlgorithmsCert, false) || len(body) != 0 {
 					return false
 				}
 			case extensionCertificateAuthorities:
@@ -1953,7 +1983,9 @@
 		if !reader.readU8LengthPrefixedBytes(&m.certificateTypes) {
 			return false
 		}
-		if m.hasSignatureAlgorithm && !parseSignatureAlgorithms(&reader, &m.signatureAlgorithms) {
+		// In TLS 1.2, the supported_signature_algorithms field in
+		// CertificateRequest may be empty.
+		if m.hasSignatureAlgorithm && !parseSignatureAlgorithms(&reader, &m.signatureAlgorithms, true) {
 			return false
 		}
 		if !parseCAs(&reader, &m.certificateAuthorities) ||
diff --git a/src/ssl/test/runner/handshake_server.go b/src/ssl/test/runner/handshake_server.go
index 0c24592..e5f2944 100644
--- a/src/ssl/test/runner/handshake_server.go
+++ b/src/ssl/test/runner/handshake_server.go
@@ -341,6 +341,10 @@
 		return fmt.Errorf("tls: expected dummy PQ padding extension of length %d, but got one of length %d", expected, config.Bugs.ExpectDummyPQPaddingLength)
 	}
 
+	if err := checkRSAPSSSupport(config.Bugs.ExpectRSAPSSSupport, hs.clientHello.signatureAlgorithms, hs.clientHello.signatureAlgorithmsCert); err != nil {
+		return err
+	}
+
 	applyBugsToClientHello(hs.clientHello, config)
 
 	return nil
diff --git a/src/ssl/test/runner/key_agreement.go b/src/ssl/test/runner/key_agreement.go
index 1b4dfc4..5a30469 100644
--- a/src/ssl/test/runner/key_agreement.go
+++ b/src/ssl/test/runner/key_agreement.go
@@ -302,6 +302,7 @@
 // x25519ECDHCurve implements ecdhCurve with X25519.
 type x25519ECDHCurve struct {
 	privateKey [32]byte
+	setHighBit bool
 }
 
 func (e *x25519ECDHCurve) offer(rand io.Reader) (publicKey []byte, err error) {
@@ -311,6 +312,9 @@
 	}
 	var out [32]byte
 	curve25519.ScalarBaseMult(&out, &e.privateKey)
+	if e.setHighBit {
+		out[31] |= 0x80
+	}
 	return out[:], nil
 }
 
@@ -354,7 +358,7 @@
 	case CurveP521:
 		return &ellipticECDHCurve{curve: elliptic.P521(), sendCompressed: config.Bugs.SendCompressedCoordinates}, true
 	case CurveX25519:
-		return &x25519ECDHCurve{}, true
+		return &x25519ECDHCurve{setHighBit: config.Bugs.SetX25519HighBit}, true
 	default:
 		return nil, false
 	}
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index 683fd3a..102cf91 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -9456,6 +9456,119 @@
 			"-verify-prefs", strconv.Itoa(int(signatureEd25519)),
 		},
 	})
+
+	rsaPSSSupportTests := []struct {
+		name                string
+		expectRSAPSSSupport RSAPSSSupport
+		verifyPrefs         []signatureAlgorithm
+		noCerts             bool
+	}{
+		// By default, RSA-PSS is fully advertised.
+		{
+			name:                "Default",
+			expectRSAPSSSupport: RSAPSSSupportBoth,
+		},
+		// Disabling RSA-PSS certificates makes it online-signature-only.
+		{
+			name:                "Default-NoCerts",
+			expectRSAPSSSupport: RSAPSSSupportOnlineSignatureOnly,
+			noCerts:             true,
+		},
+		// The above also apply if verify preferences were explicitly configured.
+		{
+			name:                "ConfigPSS",
+			expectRSAPSSSupport: RSAPSSSupportBoth,
+			verifyPrefs:         []signatureAlgorithm{signatureRSAPSSWithSHA256, signatureRSAPKCS1WithSHA256, signatureECDSAWithP256AndSHA256},
+		},
+		{
+			name:                "ConfigPSS-NoCerts",
+			expectRSAPSSSupport: RSAPSSSupportOnlineSignatureOnly,
+			verifyPrefs:         []signatureAlgorithm{signatureRSAPSSWithSHA256, signatureRSAPKCS1WithSHA256, signatureECDSAWithP256AndSHA256},
+			noCerts:             true,
+		},
+		// If verify preferences were explicitly configured without RSA-PSS support,
+		// NoCerts is a no-op and the shim correctly only sends one extension.
+		// (This is checked internally in the runner.)
+		{
+			name:                "ConfigNoPSS",
+			expectRSAPSSSupport: RSAPSSSupportNone,
+			verifyPrefs:         []signatureAlgorithm{signatureRSAPKCS1WithSHA256, signatureECDSAWithP256AndSHA256},
+		},
+		{
+			name:                "ConfigNoPSS-NoCerts",
+			expectRSAPSSSupport: RSAPSSSupportNone,
+			verifyPrefs:         []signatureAlgorithm{signatureRSAPKCS1WithSHA256, signatureECDSAWithP256AndSHA256},
+			noCerts:             true,
+		},
+	}
+
+	for _, test := range rsaPSSSupportTests {
+		var pssFlags []string
+		for _, pref := range test.verifyPrefs {
+			pssFlags = append(pssFlags, "-verify-prefs", strconv.Itoa(int(pref)))
+		}
+		if test.noCerts {
+			pssFlags = append(pssFlags, "-no-rsa-pss-rsae-certs")
+		}
+		for _, ver := range tlsVersions {
+			if ver.version < VersionTLS12 {
+				continue
+			}
+
+			// TLS 1.2 cannot express different RSAPSSSupportOnlineSignatureOnly,
+			// so it decays to RSAPSSSupportNone.
+			expect := test.expectRSAPSSSupport
+			if ver.version < VersionTLS13 && expect == RSAPSSSupportOnlineSignatureOnly {
+				expect = RSAPSSSupportNone
+			}
+
+			// If the configuration results in no RSA-PSS support, the handshake won't complete.
+			// (The test, however, still covers the RSA-PSS assertion.)
+			var localError string
+			var shouldFail bool
+			if ver.version >= VersionTLS13 && expect == RSAPSSSupportNone {
+				shouldFail = true
+				localError = "tls: no common signature algorithms"
+			}
+
+			flags := []string{"-max-version", ver.shimFlag(tls)}
+			flags = append(flags, pssFlags...)
+			testCases = append(testCases, testCase{
+				name: fmt.Sprintf("RSAPSSSupport-%s-%s-Client", test.name, ver.name),
+				config: Config{
+					MinVersion:   ver.version,
+					MaxVersion:   ver.version,
+					Certificates: []Certificate{rsaCertificate},
+					Bugs: ProtocolBugs{
+						ExpectRSAPSSSupport: expect,
+					},
+				},
+				tls13Variant:       ver.tls13Variant,
+				flags:              flags,
+				shouldFail:         shouldFail,
+				expectedLocalError: localError,
+			})
+
+			serverFlags := []string{"-require-any-client-certificate"}
+			serverFlags = append(flags, serverFlags...)
+			testCases = append(testCases, testCase{
+				testType: serverTest,
+				name:     fmt.Sprintf("RSAPSSSupport-%s-%s-Server", test.name, ver.name),
+				config: Config{
+					MinVersion:   ver.version,
+					MaxVersion:   ver.version,
+					Certificates: []Certificate{rsaCertificate},
+					Bugs: ProtocolBugs{
+						ExpectRSAPSSSupport: expect,
+					},
+				},
+				tls13Variant:       ver.tls13Variant,
+				flags:              serverFlags,
+				shouldFail:         shouldFail,
+				expectedLocalError: localError,
+			})
+		}
+	}
 }
 
 // timeouts is the retransmit schedule for BoringSSL. It doubles and
@@ -10904,6 +11017,22 @@
 		shouldFail:    true,
 		expectedError: ":ERROR_PARSING_EXTENSION:",
 	})
+
+	// Implementations should mask off the high order bit in X25519.
+	testCases = append(testCases, testCase{
+		name: "SetX25519HighBit",
+		config: Config{
+			CipherSuites: []uint16{
+				TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
+				TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
+				TLS_AES_128_GCM_SHA256,
+			},
+			CurvePreferences: []CurveID{CurveX25519},
+			Bugs: ProtocolBugs{
+				SetX25519HighBit: true,
+			},
+		},
+	})
 }
 
 func addTLS13RecordTests() {
diff --git a/src/ssl/test/test_config.cc b/src/ssl/test/test_config.cc
index f50251d..f74267d 100644
--- a/src/ssl/test/test_config.cc
+++ b/src/ssl/test/test_config.cc
@@ -133,6 +133,7 @@
   { "-expect-draft-downgrade", &TestConfig::expect_draft_downgrade },
   { "-handoff", &TestConfig::handoff },
   { "-expect-dummy-pq-padding", &TestConfig::expect_dummy_pq_padding },
+  { "-no-rsa-pss-rsae-certs", &TestConfig::no_rsa_pss_rsae_certs },
 };
 
 const Flag<std::string> kStringFlags[] = {
diff --git a/src/ssl/test/test_config.h b/src/ssl/test/test_config.h
index fb479d1..95f38e0 100644
--- a/src/ssl/test/test_config.h
+++ b/src/ssl/test/test_config.h
@@ -154,6 +154,7 @@
   int dummy_pq_padding_len = 0;
   bool handoff = false;
   bool expect_dummy_pq_padding = false;
+  bool no_rsa_pss_rsae_certs = false;
 };
 
 bool ParseConfig(int argc, char **argv, TestConfig *out_initial,
diff --git a/src/ssl/tls13_both.cc b/src/ssl/tls13_both.cc
index 2a5a935..defdac1 100644
--- a/src/ssl/tls13_both.cc
+++ b/src/ssl/tls13_both.cc
@@ -368,7 +368,7 @@
   }
 
   CERT *cert = ssl->cert;
-  CRYPTO_BUFFER *leaf_buf = sk_CRYPTO_BUFFER_value(cert->chain, 0);
+  CRYPTO_BUFFER *leaf_buf = sk_CRYPTO_BUFFER_value(cert->chain.get(), 0);
   CBB leaf, extensions;
   if (!CBB_add_u24_length_prefixed(&certificate_list, &leaf) ||
       !CBB_add_bytes(&leaf, CRYPTO_BUFFER_data(leaf_buf),
@@ -378,14 +378,14 @@
     return 0;
   }
 
-  if (hs->scts_requested && ssl->cert->signed_cert_timestamp_list != NULL) {
+  if (hs->scts_requested && ssl->cert->signed_cert_timestamp_list != nullptr) {
     CBB contents;
     if (!CBB_add_u16(&extensions, TLSEXT_TYPE_certificate_timestamp) ||
         !CBB_add_u16_length_prefixed(&extensions, &contents) ||
         !CBB_add_bytes(
             &contents,
-            CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list),
-            CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) ||
+            CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list.get()),
+            CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list.get())) ||
         !CBB_flush(&extensions)) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
       return 0;
@@ -400,16 +400,16 @@
         !CBB_add_u8(&contents, TLSEXT_STATUSTYPE_ocsp) ||
         !CBB_add_u24_length_prefixed(&contents, &ocsp_response) ||
         !CBB_add_bytes(&ocsp_response,
-                       CRYPTO_BUFFER_data(ssl->cert->ocsp_response),
-                       CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) ||
+                       CRYPTO_BUFFER_data(ssl->cert->ocsp_response.get()),
+                       CRYPTO_BUFFER_len(ssl->cert->ocsp_response.get())) ||
         !CBB_flush(&extensions)) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
       return 0;
     }
   }
 
-  for (size_t i = 1; i < sk_CRYPTO_BUFFER_num(cert->chain); i++) {
-    CRYPTO_BUFFER *cert_buf = sk_CRYPTO_BUFFER_value(cert->chain, i);
+  for (size_t i = 1; i < sk_CRYPTO_BUFFER_num(cert->chain.get()); i++) {
+    CRYPTO_BUFFER *cert_buf = sk_CRYPTO_BUFFER_value(cert->chain.get(), i);
     CBB child;
     if (!CBB_add_u24_length_prefixed(&certificate_list, &child) ||
         !CBB_add_bytes(&child, CRYPTO_BUFFER_data(cert_buf),
diff --git a/src/ssl/tls13_server.cc b/src/ssl/tls13_server.cc
index 9d7f5e0..fe0449d 100644
--- a/src/ssl/tls13_server.cc
+++ b/src/ssl/tls13_server.cc
@@ -619,10 +619,22 @@
         !CBB_add_u16_length_prefixed(&cert_request_extensions,
                                      &sigalg_contents) ||
         !CBB_add_u16_length_prefixed(&sigalg_contents, &sigalgs_cbb) ||
-        !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb)) {
+        !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb,
+                                  false /* online signature */)) {
       return ssl_hs_error;
     }
 
+    if (tls12_has_different_verify_sigalgs_for_certs(ssl)) {
+      if (!CBB_add_u16(&cert_request_extensions,
+                       TLSEXT_TYPE_signature_algorithms_cert) ||
+          !CBB_add_u16_length_prefixed(&cert_request_extensions,
+                                       &sigalg_contents) ||
+          !CBB_add_u16_length_prefixed(&sigalg_contents, &sigalgs_cbb) ||
+          !tls12_add_verify_sigalgs(ssl, &sigalgs_cbb, true /* certs */)) {
+        return ssl_hs_error;
+      }
+    }
+
     if (ssl_has_client_CAs(ssl)) {
       CBB ca_contents;
       if (!CBB_add_u16(&cert_request_extensions,
diff --git a/src/third_party/fiat/p256.c b/src/third_party/fiat/p256.c
index 82f3608..02719c5 100644
--- a/src/third_party/fiat/p256.c
+++ b/src/third_party/fiat/p256.c
@@ -1819,6 +1819,7 @@
   out->field_sqr = ec_GFp_mont_field_sqr;
   out->field_encode = ec_GFp_mont_field_encode;
   out->field_decode = ec_GFp_mont_field_decode;
+  out->scalar_inv_montgomery = ec_simple_scalar_inv_montgomery;
 };
 
 #undef BORINGSSL_NISTP256_64BIT
diff --git a/src/util/fipstools/delocate.go b/src/util/fipstools/delocate.go
index 7f45b87..827e446 100644
--- a/src/util/fipstools/delocate.go
+++ b/src/util/fipstools/delocate.go
@@ -1158,6 +1158,11 @@
 	symbols := make(map[string]struct{})
 	// localEntrySymbols contains all symbols with a .localentry directive.
 	localEntrySymbols := make(map[string]struct{})
+	// fileNumbers is the set of IDs seen in .file directives.
+	fileNumbers := make(map[int]struct{})
+	// maxObservedFileNumber contains the largest seen file number in a
+	// .file directive. Zero is not a valid number.
+	maxObservedFileNumber := 0
 
 	for _, input := range inputs {
 		forEachPath(input.ast.up, func(node *node32) {
@@ -1186,6 +1191,35 @@
 			}
 			localEntrySymbols[symbol] = struct{}{}
 		}, ruleStatement, ruleLabelContainingDirective)
+
+		forEachPath(input.ast.up, func(node *node32) {
+			assertNodeType(node, ruleLocationDirective)
+			directive := input.contents[node.begin:node.end]
+			if !strings.HasPrefix(directive, ".file") {
+				return
+			}
+			parts := strings.Fields(directive)
+			if len(parts) == 2 {
+				// This is a .file directive with just a
+				// filename. Clang appears to generate just one
+				// of these at the beginning of the output for
+				// the compilation unit. Ignore it.
+				return
+			}
+			fileNo, err := strconv.Atoi(parts[1])
+			if err != nil {
+				panic(fmt.Sprintf("Failed to parse file number from .file: %q", directive))
+			}
+
+			if _, ok := fileNumbers[fileNo]; ok {
+				panic(fmt.Sprintf("Duplicate file number %d observed", fileNo))
+			}
+			fileNumbers[fileNo] = struct{}{}
+
+			if fileNo > maxObservedFileNumber {
+				maxObservedFileNumber = fileNo
+			}
+		}, ruleStatement, ruleLocationDirective)
 	}
 
 	processor := x86_64
@@ -1204,7 +1238,10 @@
 		gotExternalsNeeded: make(map[string]struct{}),
 	}
 
-	w.WriteString(".text\nBORINGSSL_bcm_text_start:\n")
+	w.WriteString(".text\n")
+	w.WriteString(fmt.Sprintf(".file %d \"inserted_by_delocate.c\"\n", maxObservedFileNumber + 1))
+	w.WriteString(fmt.Sprintf(".loc %d 1 0\n", maxObservedFileNumber + 1))
+	w.WriteString("BORINGSSL_bcm_text_start:\n")
 
 	for _, input := range inputs {
 		if err := d.processInput(input); err != nil {
@@ -1212,7 +1249,9 @@
 		}
 	}
 
-	w.WriteString(".text\nBORINGSSL_bcm_text_end:\n")
+	w.WriteString(".text\n")
+	w.WriteString(fmt.Sprintf(".loc %d 2 0\n", maxObservedFileNumber + 1))
+	w.WriteString("BORINGSSL_bcm_text_end:\n")
 
 	// Emit redirector functions. Each is a single jump instruction.
 	var redirectorNames []string
diff --git a/src/util/fipstools/testdata/ppc64le-GlobalEntry/out.s b/src/util/fipstools/testdata/ppc64le-GlobalEntry/out.s
index 56f5b5f..304f697 100644
--- a/src/util/fipstools/testdata/ppc64le-GlobalEntry/out.s
+++ b/src/util/fipstools/testdata/ppc64le-GlobalEntry/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.text
 .Lfoo_local_target:
@@ -19,6 +21,7 @@
 
 	bl
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .LBORINGSSL_external_toc:
 .quad .TOC.-.LBORINGSSL_external_toc
diff --git a/src/util/fipstools/testdata/ppc64le-LoadToR0/out.s b/src/util/fipstools/testdata/ppc64le-LoadToR0/out.s
index c42dbe0..5fdbeb8 100644
--- a/src/util/fipstools/testdata/ppc64le-LoadToR0/out.s
+++ b/src/util/fipstools/testdata/ppc64le-LoadToR0/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.text
 .Lfoo_local_target:
@@ -23,6 +25,7 @@
 	ld 3, -8(1)
 	addi 1, 1, 288
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .type bcm_loadtoc_bar, @function
 bcm_loadtoc_bar:
diff --git a/src/util/fipstools/testdata/ppc64le-Sample/out.s b/src/util/fipstools/testdata/ppc64le-Sample/out.s
index 5823d62..e3d682e 100644
--- a/src/util/fipstools/testdata/ppc64le-Sample/out.s
+++ b/src/util/fipstools/testdata/ppc64le-Sample/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.file	"foo.c"
 	.abiversion 2
@@ -415,6 +417,7 @@
 	.ident	"GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2"
 	.section	.note.GNU-stack,"",@progbits
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .section ".toc", "aw"
 .Lredirector_toc_fprintf:
diff --git a/src/util/fipstools/testdata/ppc64le-Sample2/out.s b/src/util/fipstools/testdata/ppc64le-Sample2/out.s
index 3e9c3cb..54cbd6f 100644
--- a/src/util/fipstools/testdata/ppc64le-Sample2/out.s
+++ b/src/util/fipstools/testdata/ppc64le-Sample2/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.file	"foo.c"
 	.abiversion 2
@@ -534,6 +536,7 @@
 	.ident	"GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2"
 	.section	.note.GNU-stack,"",@progbits
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .section ".toc", "aw"
 .Lredirector_toc___fprintf_chk:
diff --git a/src/util/fipstools/testdata/ppc64le-TOCWithOffset/out.s b/src/util/fipstools/testdata/ppc64le-TOCWithOffset/out.s
index b43523a..2fff0ef 100644
--- a/src/util/fipstools/testdata/ppc64le-TOCWithOffset/out.s
+++ b/src/util/fipstools/testdata/ppc64le-TOCWithOffset/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.text
 .Lfoo_local_target:
@@ -99,6 +101,7 @@
 	ld 3, -16(1)
 	addi 1, 1, 288
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .type bcm_loadtoc__dot_Lfoo_local_target, @function
 bcm_loadtoc__dot_Lfoo_local_target:
diff --git a/src/util/fipstools/testdata/x86_64-BSS/out.s b/src/util/fipstools/testdata/x86_64-BSS/out.s
index 77dc40d..5c576d9 100644
--- a/src/util/fipstools/testdata/x86_64-BSS/out.s
+++ b/src/util/fipstools/testdata/x86_64-BSS/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.text
 	movq %rax, %rax
@@ -41,6 +43,7 @@
 
 	.quad 0
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .type aes_128_ctr_generic_storage_bss_get, @function
 aes_128_ctr_generic_storage_bss_get:
diff --git a/src/util/fipstools/testdata/x86_64-Basic/out.s b/src/util/fipstools/testdata/x86_64-Basic/out.s
index a03b5d7..02a6025 100644
--- a/src/util/fipstools/testdata/x86_64-Basic/out.s
+++ b/src/util/fipstools/testdata/x86_64-Basic/out.s
@@ -1,4 +1,6 @@
 .text
+.file 2 "inserted_by_delocate.c"
+.loc 2 1 0
 BORINGSSL_bcm_text_start:
 	# Most instructions and lines should pass unaltered. This is made up of
 	# copy-and-pasted bits of compiler output and likely does not actually
@@ -51,6 +53,7 @@
 .size	foo, .-foo
 .type	foo, @function
 .text
+.loc 2 2 0
 BORINGSSL_bcm_text_end:
 .type OPENSSL_ia32cap_get, @function
 OPENSSL_ia32cap_get:
diff --git a/src/util/fipstools/testdata/x86_64-GOTRewrite/out.s b/src/util/fipstools/testdata/x86_64-GOTRewrite/out.s
index 0420af6..0485c87 100644
--- a/src/util/fipstools/testdata/x86_64-GOTRewrite/out.s
+++ b/src/util/fipstools/testdata/x86_64-GOTRewrite/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.text
 .Lfoo_local_target:
@@ -148,6 +150,7 @@
 
 .comm foobar,64,32
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .type foobar_bss_get, @function
 foobar_bss_get:
diff --git a/src/util/fipstools/testdata/x86_64-LabelRewrite/out.s b/src/util/fipstools/testdata/x86_64-LabelRewrite/out.s
index 5dd8feb..4a01853 100644
--- a/src/util/fipstools/testdata/x86_64-LabelRewrite/out.s
+++ b/src/util/fipstools/testdata/x86_64-LabelRewrite/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	.type foo, @function
 	.globl foo
@@ -82,6 +84,7 @@
 	.quad	.L2_BCM_1-.L1_BCM_1
 
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .type bcm_redirector_memcpy, @function
 bcm_redirector_memcpy:
diff --git a/src/util/fipstools/testdata/x86_64-Sections/out.s b/src/util/fipstools/testdata/x86_64-Sections/out.s
index 768fef3..ba427ad 100644
--- a/src/util/fipstools/testdata/x86_64-Sections/out.s
+++ b/src/util/fipstools/testdata/x86_64-Sections/out.s
@@ -1,4 +1,6 @@
 .text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
 BORINGSSL_bcm_text_start:
 	# .text stays in .text
 	.text
@@ -43,6 +45,7 @@
 	.byte	0x1
 	.long	.L3
 .text
+.loc 1 2 0
 BORINGSSL_bcm_text_end:
 .type OPENSSL_ia32cap_get, @function
 OPENSSL_ia32cap_get: