external/boringssl: Sync to 35941f2923155664bd9fa5d897cb336a0ab729a1.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/c3889634a1aa52575c5d26497696238208fbd0f5..35941f2923155664bd9fa5d897cb336a0ab729a1

Test: atest CtsLibcoreTestCases
Change-Id: I7c3352e0ce7c9e236d75c1b0beb580012db2d14d
diff --git a/src/crypto/cipher_extra/e_aesccm.c b/src/crypto/cipher_extra/e_aesccm.c
index 144a909..4e6668c 100644
--- a/src/crypto/cipher_extra/e_aesccm.c
+++ b/src/crypto/cipher_extra/e_aesccm.c
@@ -67,7 +67,7 @@
 
   block128_f block;
   ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len,
-                                 1 /* large inputs */);
+                                 0 /* small inputs */);
   ctx->tag_len = tag_len;
   if (!CRYPTO_ccm128_init(&ccm_ctx->ccm, &ccm_ctx->ks.ks, block, ctr, M, L)) {
     OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR);
diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c
index 0e5063c..fb08a42 100644
--- a/src/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/src/crypto/cipher_extra/e_aesgcmsiv.c
@@ -595,7 +595,7 @@
   OPENSSL_memset(gcm_siv_ctx, 0, sizeof(struct aead_aes_gcm_siv_ctx));
 
   aes_ctr_set_key(&gcm_siv_ctx->ks.ks, NULL, &gcm_siv_ctx->kgk_block, key,
-                  key_len, 1 /* large inputs */);
+                  key_len, 0 /* small inputs */);
   gcm_siv_ctx->is_256 = (key_len == 32);
   ctx->tag_len = tag_len;
 
@@ -720,7 +720,7 @@
   OPENSSL_memcpy(out_keys->auth_key, key_material, 16);
   aes_ctr_set_key(&out_keys->enc_key.ks, NULL, &out_keys->enc_block,
                   key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16,
-                  1 /* large inputs */);
+                  0 /* small inputs */);
 }
 
 static int aead_aes_gcm_siv_seal_scatter(
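Note on the two hunks above: the final argument to aes_ctr_set_key is a hint about the expected input size, and CCM and AES-GCM-SIV now declare their inputs small. The sketch below only illustrates how such a hint can steer implementation choice; the helper names and the selection order are assumptions for illustration, not the actual aes_ctr_set_key body.

    #include <stdio.h>

    /* Toy stand-ins for the real capability probes in crypto/fipsmodule/aes;
     * the names match this diff but the bodies here are placeholders. */
    static int bsaes_capable(void) { return 1; }
    static int vpaes_capable(void) { return 1; }

    /* Hypothetical selection: a bit-sliced (bsaes) CTR routine pays a fixed
     * per-call setup cost, so it is assumed to win only on large inputs;
     * small inputs prefer the vector-permutation (vpaes) code. */
    static const char *select_ctr_impl(int large_inputs) {
      if (bsaes_capable() && (large_inputs || !vpaes_capable())) {
        return "bsaes";
      }
      if (vpaes_capable()) {
        return "vpaes";
      }
      return "aes_nohw";
    }

    int main(void) {
      printf("large inputs -> %s\n", select_ctr_impl(1));
      printf("small inputs -> %s\n", select_ctr_impl(0));
      return 0;
    }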
diff --git a/src/crypto/fipsmodule/CMakeLists.txt b/src/crypto/fipsmodule/CMakeLists.txt
index b459263..09d210b 100644
--- a/src/crypto/fipsmodule/CMakeLists.txt
+++ b/src/crypto/fipsmodule/CMakeLists.txt
@@ -32,6 +32,7 @@
     aesni-x86.${ASM_EXT}
     bn-586.${ASM_EXT}
     co-586.${ASM_EXT}
+    ghash-ssse3-x86.${ASM_EXT}
     ghash-x86.${ASM_EXT}
     md5-586.${ASM_EXT}
     sha1-586.${ASM_EXT}
@@ -68,6 +69,7 @@
     sha1-armv8.${ASM_EXT}
     sha256-armv8.${ASM_EXT}
     sha512-armv8.${ASM_EXT}
+    vpaes-armv8.${ASM_EXT}
   )
 endif()
 
@@ -98,6 +100,7 @@
 perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
 perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
 perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
+perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl)
 perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
 perlasm(ghash-x86.${ASM_EXT} modes/asm/ghash-x86.pl)
 perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl)
@@ -118,6 +121,7 @@
 perlasm(sha512-armv4.${ASM_EXT} sha/asm/sha512-armv4.pl)
 perlasm(sha512-armv8.${ASM_EXT} sha/asm/sha512-armv8.pl)
 perlasm(sha512-x86_64.${ASM_EXT} sha/asm/sha512-x86_64.pl)
+perlasm(vpaes-armv8.${ASM_EXT} aes/asm/vpaes-armv8.pl)
 perlasm(vpaes-x86_64.${ASM_EXT} aes/asm/vpaes-x86_64.pl)
 perlasm(vpaes-x86.${ASM_EXT} aes/asm/vpaes-x86.pl)
 perlasm(x86_64-mont5.${ASM_EXT} bn/asm/x86_64-mont5.pl)
diff --git a/src/crypto/fipsmodule/aes/aes_test.cc b/src/crypto/fipsmodule/aes/aes_test.cc
index a0c9411..2222b63 100644
--- a/src/crypto/fipsmodule/aes/aes_test.cc
+++ b/src/crypto/fipsmodule/aes/aes_test.cc
@@ -250,6 +250,9 @@
         SCOPED_TRACE(blocks);
         CHECK_ABI(vpaes_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key,
                   block, AES_ENCRYPT);
+#if defined(VPAES_CTR32)
+        CHECK_ABI(vpaes_ctr32_encrypt_blocks, buf, buf, blocks, &key, block);
+#endif
       }
 
       CHECK_ABI(vpaes_set_decrypt_key, kKey, bits, &key);
diff --git a/src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
new file mode 100755
index 0000000..5fa06d8
--- /dev/null
+++ b/src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl
@@ -0,0 +1,1362 @@
+#! /usr/bin/env perl
+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+##
+######################################################################
+# ARMv8 NEON adaptation by <appro@openssl.org>
+#
+# The reason for undertaking this effort is that there is at least one
+# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
+#
+#                   CBC enc     ECB enc/dec(*)   [bit-sliced enc/dec]
+# Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
+# Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
+# X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
+# Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
+# Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
+# Mongoose(***)     26.3(**)    21.0/25.0(**)    [13.3/16.8         ]
+#
+# (*)	ECB denotes approximate result for parallelizable modes
+#	such as CBC decrypt, CTR, etc.;
+# (**)	these results are worse than scalar compiler-generated
+#	code, but it's constant-time and therefore preferred;
+# (***)	presented for reference/comparison purposes;
+
+$flavour = shift;
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code.=<<___;
+.section	.rodata
+
+.type	_vpaes_consts,%object
+.align	7	// totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:	// mc_forward
+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:// mc_backward
+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:		// sr
+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+	.quad	0x0F060D040B020900, 0x070E050C030A0108
+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv:	// inv, inva
+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:	// input transform (lo, hi)
+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:	// sbou, sbot
+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:	// sb1u, sb1t
+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:	// sb2u, sb2t
+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+.Lk_dipt:	// decryption input transform
+	.quad	0x0F505B040B545F00, 0x154A411E114E451A
+	.quad	0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:	// decryption sbox final output
+	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:	// decryption sbox output *9*u, *9*t
+	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:	// decryption sbox output *D*u, *D*t
+	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:	// decryption sbox output *B*u, *B*t
+	.quad	0xD022649296B44200, 0x602646F6B0F2D404
+	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:	// decryption sbox output *E*u, *E*t
+	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+.Lk_dksd:	// decryption key schedule: invskew x*D
+	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	// decryption key schedule: invskew x*B
+	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
+	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	// decryption key schedule: invskew x*9
+	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:	// rcon
+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:	// output transform
+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.asciz  "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+
+.text
+___
+
+{
+my ($inp,$out,$key) = map("x$_",(0..2));
+
+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
+
+$code.=<<___;
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+.type	_vpaes_encrypt_preheat,%function
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, :pg_hi21:.Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
+	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
+	ret
+.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+	mov	x9, $key
+	ldr	w8, [$key,#240]			// pull rounds
+	adrp	x11, :pg_hi21:.Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Lenc_entry
+
+.align 4
+.Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{$sb2t}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+.Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	vpaes_encrypt
+.type	vpaes_encrypt,%function
+.align	4
+vpaes_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [$inp]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out]
+
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.type	_vpaes_encrypt_2x,%function
+.align 4
+_vpaes_encrypt_2x:
+	mov	x9, $key
+	ldr	w8, [$key,#240]			// pull rounds
+	adrp	x11, :pg_hi21:.Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+	 and	v9.16b,  v15.16b,  v17.16b
+	 ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	 tbl	v9.16b,  {$iptlo}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	 tbl	v10.16b, {$ipthi}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	 eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	 eor	v8.16b,  v8.16b,   v10.16b
+	b	.Lenc_2x_entry
+
+.align 4
+.Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {$sb1t}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	 tbl	v12.16b, {$sb1t}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b,  {$sb1u}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	 tbl	v8.16b,  {$sb1u}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	 eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {$sb2t}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	 tbl	v13.16b, {$sb2t}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	 eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {$sb2u}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	 tbl	v10.16b, {$sb2u}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	 tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	 eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	 tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	 eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	 tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	 eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	 eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+.Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	 and	v9.16b,  v8.16b, v17.16b
+	 ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	 tbl	v13.16b, {$invhi},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	 eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	 tbl	v11.16b, {$invlo},v8.16b
+	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	 tbl	v12.16b, {$invlo},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	 eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	 eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	 tbl	v10.16b, {$invlo},v11.16b
+	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	 tbl	v11.16b, {$invlo},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	 eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	 eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	 tbl	v12.16b, {$sbou}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	 tbl	v8.16b,  {$sbot}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	 eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	 eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	 tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type	_vpaes_decrypt_preheat,%function
+.align	4
+_vpaes_decrypt_preheat:
+	adrp	x10, :pg_hi21:.Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f
+	adrp	x11, :pg_hi21:.Lk_dipt
+	add	x11, x11, :lo12:.Lk_dipt
+	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
+	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
+	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
+	ret
+.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,%function
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, $key
+	ldr	w8, [$key,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
+	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
+	adrp	x10, :pg_hi21:.Lk_sr
+	add	x10, x10, :lo12:.Lk_sr
+	and	x11, x11, #0x30			// and		\$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, :pg_hi21:.Lk_mc_forward+48
+	add	x10, x10, :lo12:.Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Ldec_entry
+
+.align 4
+.Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl 	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		\$1,%rax			# nr--
+
+.Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl	vpaes_decrypt
+.type	vpaes_decrypt,%function
+.align	4
+vpaes_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [$inp]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [$out]
+
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_decrypt,.-vpaes_decrypt
+
+// v14-v15 input, v0-v1 output
+.type	_vpaes_decrypt_2x,%function
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, $key
+	ldr	w8, [$key,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
+	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
+	adrp	x10, :pg_hi21:.Lk_sr
+	add	x10, x10, :lo12:.Lk_sr
+	and	x11, x11, #0x30			// and		\$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, :pg_hi21:.Lk_mc_forward+48
+	add	x10, x10, :lo12:.Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+	 and	v9.16b,  v15.16b, v17.16b
+	 ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {$iptlo},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	 tbl	v10.16b, {$iptlo},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {$ipthi},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	 tbl	v8.16b,  {$ipthi},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	 eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	 eor	v8.16b,  v8.16b,  v10.16b
+	b	.Ldec_2x_entry
+
+.align 4
+.Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {$sb9u}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	 tbl	v12.16b, {$sb9u}, v10.16b
+	tbl	v1.16b,  {$sb9t}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	 tbl	v9.16b,  {$sb9t}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	 eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	 eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {$sbdu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	 tbl	v12.16b, {$sbdu}, v10.16b
+	tbl 	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	 tbl 	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {$sbdt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	 tbl	v9.16b,  {$sbdt}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	 eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	 eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {$sbbu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	 tbl	v12.16b, {$sbbu}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	 tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {$sbbt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	 tbl	v9.16b,  {$sbbt}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	 eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	 eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {$sbeu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	 tbl	v12.16b, {$sbeu}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	 tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {$sbet}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	 tbl	v9.16b,  {$sbet}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	 eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	 eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		\$1,%rax			# nr--
+
+.Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
+	 and	v9.16b,  v8.16b,  v17.16b
+	 ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	 tbl	v10.16b, {$invhi},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	 eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	 tbl	v11.16b, {$invlo},v8.16b
+	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	 tbl	v12.16b, {$invlo},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	 eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	 eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	 tbl	v10.16b, {$invlo},v11.16b
+	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	 tbl	v11.16b, {$invlo},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	 eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	 eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	 tbl	v12.16b, {$sbou}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	 tbl	v9.16b,  {$sbot}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	 eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	 eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	 tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+___
+}
+{
+my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
+
+$code.=<<___;
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, :pg_hi21:.Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v16.16b, #0x5b			// .Lk_s63
+	adrp	x11, :pg_hi21:.Lk_sb1
+	add	x11, x11, :lo12:.Lk_sb1
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adrp	x10, :pg_hi21:.Lk_dksd
+	add	x10, x10, :lo12:.Lk_dksd
+	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
+	adrp	x11, :pg_hi21:.Lk_mc_forward
+	add	x11, x11, :lo12:.Lk_mc_forward
+	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [$inp],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, :pg_hi21:.Lk_sr		// lea	.Lk_sr(%rip),%r10
+	add	x10, x10, :lo12:.Lk_sr
+
+	add	x8, x8, x10
+	cbnz	$dir, .Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	\$0x30, %r8
+
+.Lschedule_go:
+	cmp	$bits, #192			// cmp	\$192,	%esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	$inp, #10			// mov	\$10, %esi
+
+.Loop_schedule_128:
+	sub	$inp, $inp, #1			// dec	%esi
+	bl 	_vpaes_schedule_round
+	cbz 	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b 	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	4
+.Lschedule_192:
+	sub	$inp, $inp, #8
+	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	$inp, #4			// mov	\$4,	%esi
+
+.Loop_schedule_192:
+	sub	$inp, $inp, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz 	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	$inp, #7			// mov	\$7, %esi
+
+.Loop_schedule_256:
+	sub	$inp, $inp, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz 	$inp, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, :pg_hi21:.Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, :lo12:.Lk_deskew
+
+	cbnz	$dir, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, :pg_hi21:.Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, :lo12:.Lk_opt
+	add	$out, $out, #32			// add	\$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
+	sub	$out, $out, #16			// add	\$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
+	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
+	 eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	 eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	$dir, .Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	$out, $out, #16			// add	\$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	$out, $out, #16			// add	\$-16,	%rdx
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #64-16			// add	\$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
+	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, $bits, #5		// shr	\$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	$dir, #0		// mov	\$0,%ecx
+	mov	x8, #0x30		// mov	\$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, $bits, #5		// shr	\$5,%eax
+	add	w9, w9, #5		// \$5,%eax
+	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	\$4,%eax
+	add	$out, $out, #16		// lea	16(%rdx,%rax),%rdx
+	add	$out, $out, x9
+
+	mov	$dir, #1		// mov	\$1,%ecx
+	lsr	w8, $bits, #1		// shr	\$1,%r8d
+	and	x8, x8, #32		// and	\$32,%r8d
+	eor	x8, x8, #32		// xor	\$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+___
+}
+{
+my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
+
+$code.=<<___;
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,%function
+.align	4
+vpaes_cbc_encrypt:
+	cbz	$len, .Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, $len		// reassign
+	mov	x2,  $key		// reassign
+
+	ld1	{v0.16b}, [$ivec]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	.Lcbc_enc_loop
+
+.align	4
+.Lcbc_enc_loop:
+	ld1	{v7.16b}, [$inp],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16	// save output
+	subs	x17, x17, #16
+	b.hi	.Lcbc_enc_loop
+
+	st1	{v0.16b}, [$ivec]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+.Lcbc_abort:
+	ret
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type	vpaes_cbc_decrypt,%function
+.align	4
+vpaes_cbc_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len		// reassign
+	mov	x2,  $key		// reassign
+	ld1	{v6.16b}, [$ivec]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [$inp], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [$out], #16
+	subs	x17, x17, #16
+	b.ls	.Lcbc_dec_done
+
+.align	4
+.Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+	st1	{v6.16b}, [$ivec]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+___
+# We omit vpaes_ecb_* in BoringSSL. They are unused.
+if (0) {
+$code.=<<___;
+.globl	vpaes_ecb_encrypt
+.type	vpaes_ecb_encrypt,%function
+.align	4
+vpaes_ecb_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len
+	mov	x2,  $key
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_enc_loop
+
+	ld1	{v7.16b}, [$inp],#16
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_enc_done
+
+.align	4
+.Lecb_enc_loop:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_encrypt_2x
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_enc_loop
+
+.Lecb_enc_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl	vpaes_ecb_decrypt
+.type	vpaes_ecb_decrypt,%function
+.align	4
+vpaes_ecb_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, $len
+	mov	x2,  $key
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_dec_loop
+
+	ld1	{v7.16b}, [$inp],#16
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [$out],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_dec_done
+
+.align	4
+.Lecb_dec_loop:
+	ld1	{v14.16b,v15.16b}, [$inp], #32
+	bl	_vpaes_decrypt_2x
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_dec_loop
+
+.Lecb_dec_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
+___
+}
+
+my ($ctr, $ctr_tmp) = ("w6", "w7");
+
+# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+#                                 const AES_KEY *key, const uint8_t ivec[16]);
+$code.=<<___;
+.globl	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	$len, .Lctr32_done
+
+	// Note, unlike the other functions, $len here is measured in blocks,
+	// not bytes.
+	mov	x17, $len
+	mov	x2,  $key
+
+	// Load the IV and counter portion.
+	ldr	$ctr, [$ivec, #12]
+	ld1	{v7.16b}, [$ivec]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	$ctr, $ctr		// The counter is big-endian.
+	b.eq	.Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [$inp], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [$out], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	$ctr, $ctr, #1
+	rev	$ctr_tmp, $ctr
+	mov	v7.s[3], $ctr_tmp
+	b.ls	.Lctr32_done
+
+.Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	$ctr, $ctr, #1
+	rev	$ctr_tmp, $ctr
+	mov	v15.s[3], $ctr_tmp
+
+.Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [$inp], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [$out], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	$ctr_tmp, $ctr, #1
+	add	$ctr, $ctr, #2
+	rev	$ctr_tmp, $ctr_tmp
+	mov	v14.s[3], $ctr_tmp
+	rev	$ctr_tmp, $ctr
+	mov	v15.s[3], $ctr_tmp
+	b.hi	.Lctr32_loop
+
+.Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+___
+}
+
+print $code;
+
+close STDOUT;
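Note: the new AArch64 vpaes_ctr32_encrypt_blocks entry point takes its length in 16-byte blocks, not bytes, and increments the big-endian 32-bit counter held in bytes 12..15 of ivec. The sketch below is a hedged usage illustration, assuming an AArch64 build where the vpaes symbols are compiled in (VPAES_CTR32 defined); real callers reach this routine through the EVP/ctr128 plumbing rather than directly.

    #include <stddef.h>
    #include <stdint.h>
    #include <openssl/aes.h>

    /* Prototypes as declared in aes/internal.h; the CTR one appears in the
     * internal.h hunk below. */
    int vpaes_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key);
    void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
                                    const AES_KEY *key, const uint8_t ivec[16]);

    /* Encrypt num_blocks 16-byte blocks in CTR mode. ivec holds a 12-byte
     * nonce followed by a big-endian 32-bit counter; only those low 32 bits
     * are incremented (the usual ctr128_f convention). */
    static void ctr32_demo(const uint8_t key_bytes[16], const uint8_t ivec[16],
                           const uint8_t *in, uint8_t *out, size_t num_blocks) {
      AES_KEY ks;
      vpaes_set_encrypt_key(key_bytes, 128, &ks);
      vpaes_ctr32_encrypt_blocks(in, out, num_blocks, &ks, ivec);
    }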
diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h
index 0df30d9..a05abcb 100644
--- a/src/crypto/fipsmodule/aes/internal.h
+++ b/src/crypto/fipsmodule/aes/internal.h
@@ -35,13 +35,13 @@
 }
 
 #define VPAES
-OPENSSL_INLINE char vpaes_capable(void) {
+OPENSSL_INLINE int vpaes_capable(void) {
   return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
 }
 
 #if defined(OPENSSL_X86_64)
 #define BSAES
-OPENSSL_INLINE char bsaes_capable(void) { return vpaes_capable(); }
+OPENSSL_INLINE int bsaes_capable(void) { return vpaes_capable(); }
 #endif  // X86_64
 
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
@@ -51,7 +51,13 @@
 
 #if defined(OPENSSL_ARM)
 #define BSAES
-OPENSSL_INLINE char bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
+OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); }
+#endif
+
+#if defined(OPENSSL_AARCH64)
+#define VPAES
+#define VPAES_CTR32
+OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); }
 #endif
 
 #elif defined(OPENSSL_PPC64LE)
@@ -162,6 +168,10 @@
 
 void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                        const AES_KEY *key, uint8_t *ivec, int enc);
+#if defined(VPAES_CTR32)
+void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
+                                const AES_KEY *key, const uint8_t ivec[16]);
+#endif
 #else
 OPENSSL_INLINE char vpaes_capable(void) { return 0; }
 
diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c
index 460deed..51a1fb1 100644
--- a/src/crypto/fipsmodule/cipher/e_aes.c
+++ b/src/crypto/fipsmodule/cipher/e_aes.c
@@ -143,7 +143,15 @@
   } else if (vpaes_capable()) {
     ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = vpaes_encrypt;
-    dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? vpaes_cbc_encrypt : NULL;
+    dat->stream.cbc = NULL;
+    if (mode == EVP_CIPH_CBC_MODE) {
+      dat->stream.cbc = vpaes_cbc_encrypt;
+    }
+#if defined(VPAES_CTR32)
+    if (mode == EVP_CIPH_CTR_MODE) {
+      dat->stream.ctr = vpaes_ctr32_encrypt_blocks;
+    }
+#endif
   } else {
     ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
     dat->block = aes_nohw_encrypt;
@@ -253,7 +261,11 @@
     if (gcm_key != NULL) {
       CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0);
     }
+#if defined(VPAES_CTR32)
+    return vpaes_ctr32_encrypt_blocks;
+#else
     return NULL;
+#endif
   }
 
   aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key);
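Note on the hunk above: returning vpaes_ctr32_encrypt_blocks instead of NULL lets CTR and GCM callers use a batched ctr32 path, while a NULL ctr128_f means the caller falls back to generating keystream one block at a time. The sketch below illustrates that caller-side pattern with stand-in typedefs; the typedefs are modeled on block128_f/ctr128_f from modes/internal.h, and the fallback structure is an assumption about the wiring, not a copy of the real helpers.

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-ins modeled on block128_f/ctr128_f; the real typedefs take an
     * AES_KEY-style key pointer. */
    typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                               const void *key);
    typedef void (*ctr128_f)(const uint8_t *in, uint8_t *out, size_t blocks,
                             const void *key, const uint8_t ivec[16]);

    /* If a batched ctr32 routine is available, use it; otherwise encrypt the
     * counter block-by-block with the plain block function. */
    void ctr_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks,
                            const void *key, uint8_t ivec[16],
                            block128_f block, ctr128_f ctr) {
      if (ctr != NULL) {
        ctr(in, out, blocks, key, ivec);
        return;
      }
      for (size_t i = 0; i < blocks; i++) {
        uint8_t keystream[16];
        block(ivec, keystream, key);
        for (size_t j = 0; j < 16; j++) {
          out[16 * i + j] = in[16 * i + j] ^ keystream[j];
        }
        /* Increment the big-endian 32-bit counter in ivec[12..15]. */
        for (int j = 15; j >= 12; j--) {
          if (++ivec[j] != 0) {
            break;
          }
        }
      }
    }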
diff --git a/src/crypto/fipsmodule/ec/ec_key.c b/src/crypto/fipsmodule/ec/ec_key.c
index 04650ed..3ef17d9 100644
--- a/src/crypto/fipsmodule/ec/ec_key.c
+++ b/src/crypto/fipsmodule/ec/ec_key.c
@@ -267,7 +267,7 @@
     return 0;
   }
 
-  if (EC_GROUP_cmp(key->group, pub_key->group, NULL) != 0) {
+  if (pub_key != NULL && EC_GROUP_cmp(key->group, pub_key->group, NULL) != 0) {
     OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH);
     return 0;
   }
diff --git a/src/crypto/fipsmodule/ec/ec_test.cc b/src/crypto/fipsmodule/ec/ec_test.cc
index 97c6d45..dd4c75a 100644
--- a/src/crypto/fipsmodule/ec/ec_test.cc
+++ b/src/crypto/fipsmodule/ec/ec_test.cc
@@ -347,6 +347,20 @@
       EC_KEY_set_public_key(key.get(), EC_GROUP_get0_generator(group.get())));
 }
 
+TEST(ECTest, SetNULLKey) {
+  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_X9_62_prime256v1));
+  ASSERT_TRUE(key);
+
+  EXPECT_TRUE(EC_KEY_set_public_key(
+      key.get(), EC_GROUP_get0_generator(EC_KEY_get0_group(key.get()))));
+  EXPECT_TRUE(EC_KEY_get0_public_key(key.get()));
+
+  // Setting a NULL public-key should clear the public-key and return zero, in
+  // order to match OpenSSL behaviour exactly.
+  EXPECT_FALSE(EC_KEY_set_public_key(key.get(), nullptr));
+  EXPECT_FALSE(EC_KEY_get0_public_key(key.get()));
+}
+
 TEST(ECTest, GroupMismatch) {
   bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_secp384r1));
   ASSERT_TRUE(key);
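Note: the ec_key.c hunk above only skips the group comparison when pub_key is NULL; combined with the existing code, EC_KEY_set_public_key(key, NULL) now clears the stored public key and returns 0 (matching OpenSSL), as the new test checks. A small caller-side sketch of that contract, using only the public API the test exercises:

    #include <openssl/ec_key.h>

    /* Clears any stored public key. EC_KEY_set_public_key(key, NULL) is
     * expected to return 0 by design, so the return value alone does not
     * signal an error here; check the resulting state instead. */
    int clear_public_key(EC_KEY *key) {
      (void)EC_KEY_set_public_key(key, NULL);
      return EC_KEY_get0_public_key(key) == NULL;
    }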
diff --git a/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl b/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl
index e93e5b3..778b543 100644
--- a/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl
+++ b/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl
@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
@@ -150,6 +150,8 @@
 .text
 #if defined(__thumb2__) || defined(__clang__)
 .syntax	unified
+#define ldrplb  ldrbpl
+#define ldrneb  ldrbne
 #endif
 #if defined(__thumb2__)
 .thumb
@@ -157,11 +159,6 @@
 .code	32
 #endif
 
-#ifdef  __clang__
-#define ldrplb  ldrbpl
-#define ldrneb  ldrbne
-#endif
-
 .type	rem_4bit,%object
 .align	5
 rem_4bit:
diff --git a/src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl b/src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
new file mode 100644
index 0000000..0d9ce15
--- /dev/null
+++ b/src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
@@ -0,0 +1,288 @@
+#!/usr/bin/env perl
+# Copyright (c) 2019, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
+# table-based GHASH implementation. It requires SSSE3 instructions.
+#
+# For background, the table-based strategy is a 4-bit windowed multiplication.
+# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
+# over 4-bit windows of the input and looks them up in the table. Visually,
+# it multiplies as in the schoolbook multiplication diagram below, but with
+# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
+# it incorporates the terms labeled '1' by indexing the most significant term
+# of X into the table. Then it shifts and repeats for '2' and so on.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        666666
+#       555555
+#      444444
+#     333333
+#    222222
+#   111111
+#
+# This implementation changes the order. We treat the table as a 16×16 matrix
+# and transpose it. The first row is then the first byte of each multiple of H,
+# and so on. We then reorder terms as below. Observe that the terms labeled '1'
+# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
+# pshufb instruction, using alternating terms of X in parallel as indices. This
+# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
+# repeat for each row.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        224466
+#       113355
+#      224466
+#     113355
+#    224466
+#   113355
+#
+# Next we account for GCM's confusing bit order. The "first" bit is the least
+# significant coefficient, but GCM treats the most sigificant bit within a byte
+# as first. Bytes are little-endian, and bits are big-endian. We reverse the
+# bytes in XMM registers for a consistent bit and byte ordering, but this means
+# the least significant bit is the most significant coefficient and vice versa.
+#
+# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
+# bit ordering within the XMM register, rather than the reversed coefficient
+# ordering. Low bits are less significant bits and more significant
+# coefficients. Right-shifts move bits from the MSB toward the LSB and
+# correspond to increasing the power of each coefficient.
+#
+# Note this bit reversal enters into the table's column indices. H*1 is stored
+# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
+# table rows contain more significant coefficients, so we iterate forwards.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open STDOUT, ">$output";
+
+&asm_init($ARGV[0]);
+
+my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
+&static_label("reverse_bytes");
+&static_label("low4_mask");
+
+my $call_counter = 0;
+# process_rows emits assembly code to process $rows rows of the table. On
+# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
+# low and high halves of the input. The result so far is passed in xmm2. xmm3
+# must be zero. On output, $Htable is advanced to the next row and xmm2 is
+# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
+sub process_rows {
+	my ($rows) = @_;
+	$call_counter++;
+
+	# Shifting whole XMM registers by bits is complex. psrldq shifts by
+	# bytes, and psrlq shifts the two 64-bit halves separately. Each row
+	# produces 8 bits of carry, and the reduction needs an additional 7-bit
+	# shift. This must fit in 64 bits so reduction can use psrlq. This
+	# allows up to 7 rows at a time.
+	die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
+
+	&mov("eax", $rows);
+&set_label("loop_row_$call_counter");
+	&movdqa("xmm4", &QWP(0, $Htable));
+	&lea($Htable, &DWP(16, $Htable));
+
+	# Right-shift xmm2 and xmm3 by 8 bits.
+	&movdqa("xmm6", "xmm2");
+	&palignr("xmm6", "xmm3", 1);
+	&movdqa("xmm3", "xmm6");
+	&psrldq("xmm2", 1);
+
+	# Load the next table row and index the low and high bits of the input.
+	# Note the low (respectively, high) half corresponds to more
+	# (respectively, less) significant coefficients.
+	&movdqa("xmm5", "xmm4");
+	&pshufb("xmm4", "xmm0");
+	&pshufb("xmm5", "xmm1");
+
+	# Add the high half (xmm5) without shifting.
+	&pxor("xmm2", "xmm5");
+
+	# Add the low half (xmm4). This must be right-shifted by 4 bits. First,
+	# add into the carry register (xmm3).
+	&movdqa("xmm5", "xmm4");
+	&psllq("xmm5", 60);
+	&movdqa("xmm6", "xmm5");
+	&pslldq("xmm6", 8);
+	&pxor("xmm3", "xmm6");
+
+	# Next, add into xmm2.
+	&psrldq("xmm5", 8);
+	&pxor("xmm2", "xmm5");
+	&psrlq("xmm4", 4);
+	&pxor("xmm2", "xmm4");
+
+	&sub("eax", 1);
+	&jnz(&label("loop_row_$call_counter"));
+
+	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
+	# x^7, so we shift and XOR four times.
+	&pxor("xmm2", "xmm3");	# x^0 = 0
+	&psrlq("xmm3", 1);
+	&pxor("xmm2", "xmm3");	# x^1 = x
+	&psrlq("xmm3", 1);
+	&pxor("xmm2", "xmm3");	# x^(1+1) = x^2
+	&psrlq("xmm3", 5);
+	&pxor("xmm2", "xmm3");	# x^(1+1+5) = x^7
+	&pxor("xmm3", "xmm3");
+}
+
+# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
+# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
+# formatted as described above.
+# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+&function_begin("gcm_gmult_ssse3");
+	&mov($Xi, &wparam(0));
+	&mov($Htable, &wparam(1));
+
+	&movdqu("xmm0", &QWP(0, $Xi));
+	&call(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("eax");
+	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
+	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
+
+	# Reverse input bytes to deserialize.
+	&pshufb("xmm0", "xmm7");
+
+	# Split each byte into low (xmm0) and high (xmm1) halves.
+	&movdqa("xmm1", "xmm2");
+	&pandn("xmm1", "xmm0");
+	&psrld("xmm1", 4);
+	&pand("xmm0", "xmm2");
+
+	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+	# that, due to bit reversal, xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+
+	# We must reduce at least once every 7 rows, so divide into three
+	# chunks.
+	&process_rows(5);
+	&process_rows(5);
+	&process_rows(6);
+
+	# Store the result. Reverse bytes to serialize.
+	&pshufb("xmm2", "xmm7");
+	&movdqu(&QWP(0, $Xi), "xmm2");
+
+	# Zero any registers which contain secrets.
+	&pxor("xmm0", "xmm0");
+	&pxor("xmm1", "xmm1");
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+	&pxor("xmm4", "xmm4");
+	&pxor("xmm5", "xmm5");
+	&pxor("xmm6", "xmm6");
+&function_end("gcm_gmult_ssse3");
+
+# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
+# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
+# serialized byte representation. |Htable| is formatted as described above.
+# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+#                      size_t len);
+&function_begin("gcm_ghash_ssse3");
+	&mov($Xi, &wparam(0));
+	&mov($Htable, &wparam(1));
+	&mov($in, &wparam(2));
+	&mov($len, &wparam(3));
+
+	&movdqu("xmm0", &QWP(0, $Xi));
+	&call(&label("pic_point"));
+&set_label("pic_point");
+	&blindpop("ebx");
+	&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));
+
+	# This function only processes whole blocks.
+	&and($len, -16);
+
+	# Reverse input bytes to deserialize. We maintain the running
+	# total in xmm0.
+	&pshufb("xmm0", "xmm7");
+
+	# Iterate over each block. On entry to each iteration, xmm3 is zero.
+	&pxor("xmm3", "xmm3");
+&set_label("loop_ghash");
+	&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));
+
+	# Incorporate the next block of input.
+	&movdqu("xmm1", &QWP(0, $in));
+	&pshufb("xmm1", "xmm7");	# Reverse bytes.
+	&pxor("xmm0", "xmm1");
+
+	# Split each byte into low (xmm0) and high (xmm1) halves.
+	&movdqa("xmm1", "xmm2");
+	&pandn("xmm1", "xmm0");
+	&psrld("xmm1", 4);
+	&pand("xmm0", "xmm2");
+
+	# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+	# that, due to bit reversal, xmm3 contains bits that fall off when
+	# right-shifting, not left-shifting.
+	&pxor("xmm2", "xmm2");
+	# xmm3 is already zero at this point.
+
+	# We must reduce at least once every 7 rows, so divide into three
+	# chunks.
+	&process_rows(5);
+	&process_rows(5);
+	&process_rows(6);
+
+	&movdqa("xmm0", "xmm2");
+
+	# Rewind $Htable for the next iteration.
+	&lea($Htable, &DWP(-256, $Htable));
+
+	# Advance input and continue.
+	&lea($in, &DWP(16, $in));
+	&sub($len, 16);
+	&jnz(&label("loop_ghash"));
+
+	# Reverse bytes and store the result.
+	&pshufb("xmm0", "xmm7");
+	&movdqu(&QWP(0, $Xi), "xmm0");
+
+	# Zero any registers which contain secrets.
+	&pxor("xmm0", "xmm0");
+	&pxor("xmm1", "xmm1");
+	&pxor("xmm2", "xmm2");
+	&pxor("xmm3", "xmm3");
+	&pxor("xmm4", "xmm4");
+	&pxor("xmm5", "xmm5");
+	&pxor("xmm6", "xmm6");
+&function_end("gcm_ghash_ssse3");
+
+# reverse_bytes is a permutation which, if applied with pshufb, reverses the
+# bytes in an XMM register.
+&set_label("reverse_bytes", 16);
+&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+# low4_mask is an XMM mask which selects the low four bits of each byte.
+&set_label("low4_mask", 16);
+&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
+
+&asm_finish();
+
+close STDOUT;
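
As an aside on the reduction comments in the new ghash-ssse3-x86.pl above: a minimal C sketch (illustrative only; fold_carry is not a name from this change) of how a 64-bit carry word is folded back with the reduction polynomial 1 + x + x^2 + x^7, using the same cumulative shift distances as the emitted pxor/psrlq sequence (0, 1, 2, then 7 bits):

#include <stdint.h>

// Sketch of the reduction step: XOR the carry in at cumulative right-shifts
// of 0, 1, 2 and 7 bits, matching the psrlq immediates of 1, 1 and 5 above.
static uint64_t fold_carry(uint64_t acc, uint64_t carry) {
  acc ^= carry;   // x^0 term
  carry >>= 1;
  acc ^= carry;   // x^1 term
  carry >>= 1;
  acc ^= carry;   // x^2 term
  carry >>= 5;
  acc ^= carry;   // x^7 term
  return acc;
}
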
diff --git a/src/crypto/fipsmodule/modes/gcm.c b/src/crypto/fipsmodule/modes/gcm.c
index f92f675..ca077ac 100644
--- a/src/crypto/fipsmodule/modes/gcm.c
+++ b/src/crypto/fipsmodule/modes/gcm.c
@@ -241,7 +241,7 @@
 // still in L1 cache after encryption pass...
 #define GHASH_CHUNK (3 * 1024)
 
-#if defined(GHASH_ASM_X86_64)
+#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
 void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) {
   // Run the existing 4-bit version.
   gcm_init_4bit(Htable, Xi);
@@ -266,7 +266,7 @@
     }
   }
 }
-#endif  // GHASH_ASM_X86_64
+#endif  // GHASH_ASM_X86_64 || GHASH_ASM_X86
 
 #ifdef GCM_FUNCREF_4BIT
 #undef GCM_MUL
@@ -321,6 +321,12 @@
     *out_hash = gcm_ghash_clmul;
     return;
   }
+  if (gcm_ssse3_capable()) {
+    gcm_init_ssse3(out_table, H.u);
+    *out_mult = gcm_gmult_ssse3;
+    *out_hash = gcm_ghash_ssse3;
+    return;
+  }
 #elif defined(GHASH_ASM_ARM)
   if (gcm_pmull_capable()) {
     gcm_init_v8(out_table, H.u);
diff --git a/src/crypto/fipsmodule/modes/gcm_test.cc b/src/crypto/fipsmodule/modes/gcm_test.cc
index 47ecd29..b2e805c 100644
--- a/src/crypto/fipsmodule/modes/gcm_test.cc
+++ b/src/crypto/fipsmodule/modes/gcm_test.cc
@@ -148,7 +148,7 @@
   }
 #endif  // GHASH_ASM_X86
 
-#if defined(GHASH_ASM_X86_64)
+#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
   if (gcm_ssse3_capable()) {
     CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
@@ -156,9 +156,7 @@
       CHECK_ABI_SEH(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
     }
   }
-#endif  // GHASH_ASM_X86_64
 
-#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
     CHECK_ABI_SEH(gcm_init_clmul, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_clmul, X, Htable);
diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h
index 5f9d035..9a081eb 100644
--- a/src/crypto/fipsmodule/modes/internal.h
+++ b/src/crypto/fipsmodule/modes/internal.h
@@ -282,13 +282,6 @@
 void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
-#if defined(OPENSSL_X86_64)
-#define GHASH_ASM_X86_64
-void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
-void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
-                   size_t len);
-
 OPENSSL_INLINE char gcm_ssse3_capable(void) {
   return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
 }
@@ -300,6 +293,13 @@
 void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
                      size_t len);
 
+#if defined(OPENSSL_X86_64)
+#define GHASH_ASM_X86_64
+void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
+                   size_t len);
+
 #define AESNI_GCM
 size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi);
diff --git a/src/crypto/mem.c b/src/crypto/mem.c
index a06061b..14e0bdf 100644
--- a/src/crypto/mem.c
+++ b/src/crypto/mem.c
@@ -71,6 +71,14 @@
 
 #define OPENSSL_MALLOC_PREFIX 8
 
+#if defined(OPENSSL_ASAN)
+void __asan_poison_memory_region(const volatile void *addr, size_t size);
+void __asan_unpoison_memory_region(const volatile void *addr, size_t size);
+#else
+static void __asan_poison_memory_region(const void *addr, size_t size) {}
+static void __asan_unpoison_memory_region(const void *addr, size_t size) {}
+#endif
+
 #if defined(__GNUC__) || defined(__clang__)
 // sdallocx is a sized |free| function. By passing the size (which we happen to
 // always know in BoringSSL), the malloc implementation can save work. We cannot
@@ -99,6 +107,7 @@
 
   *(size_t *)ptr = size;
 
+  __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
   return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX;
 }
 
@@ -108,6 +117,7 @@
   }
 
   void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX;
+  __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
 
   size_t size = *(size_t *)ptr;
   OPENSSL_cleanse(ptr, size + OPENSSL_MALLOC_PREFIX);
@@ -120,7 +130,9 @@
   }
 
   void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX;
+  __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
   size_t old_size = *(size_t *)ptr;
+  __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX);
 
   void *ret = OPENSSL_malloc(new_size);
   if (ret == NULL) {
@@ -153,6 +165,10 @@
 #endif  // !OPENSSL_NO_ASM
 }
 
+void OPENSSL_clear_free(void *ptr, size_t unused) {
+  OPENSSL_free(ptr);
+}
+
 int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) {
   const uint8_t *a = in_a;
   const uint8_t *b = in_b;
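
The poison/unpoison calls added to mem.c above guard the 8-byte length prefix that sits in front of every allocation. A standalone C sketch of the same idea (illustrative only; sized_malloc, sized_free and PREFIX are not BoringSSL names), assuming a compiler that exposes AddressSanitizer through the __has_feature probe:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#include <sanitizer/asan_interface.h>
#define SKETCH_HAVE_ASAN 1
#endif
#endif

#if !defined(SKETCH_HAVE_ASAN)
// No-op stand-ins when not building with AddressSanitizer.
static void __asan_poison_memory_region(const volatile void *addr, size_t size) {}
static void __asan_unpoison_memory_region(const volatile void *addr, size_t size) {}
#endif

#define PREFIX 8  // plays the role of OPENSSL_MALLOC_PREFIX

// Store the size in a prefix and poison it, so any read below the returned
// pointer is flagged by ASan.
static void *sized_malloc(size_t size) {
  uint8_t *ptr = malloc(size + PREFIX);
  if (ptr == NULL) {
    return NULL;
  }
  *(size_t *)ptr = size;
  __asan_poison_memory_region(ptr, PREFIX);
  return ptr + PREFIX;
}

// Unpoison the prefix before the allocator reads it back, then scrub and
// release the whole block (memset stands in for OPENSSL_cleanse here).
static void sized_free(void *orig) {
  if (orig == NULL) {
    return;
  }
  uint8_t *ptr = (uint8_t *)orig - PREFIX;
  __asan_unpoison_memory_region(ptr, PREFIX);
  size_t size = *(size_t *)ptr;
  memset(ptr, 0, size + PREFIX);
  free(ptr);
}
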
diff --git a/src/include/openssl/mem.h b/src/include/openssl/mem.h
index 4e1c2ca..2e25f52 100644
--- a/src/include/openssl/mem.h
+++ b/src/include/openssl/mem.h
@@ -138,7 +138,7 @@
 
 // OPENSSL_clear_free calls |OPENSSL_free|. BoringSSL automatically clears all
 // allocations on free, but we define |OPENSSL_clear_free| for compatibility.
-#define OPENSSL_clear_free(ptr, len) OPENSSL_free(ptr)
+OPENSSL_EXPORT void OPENSSL_clear_free(void *ptr, size_t len);
 
 
 #if defined(__cplusplus)
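
A minimal usage sketch of the exported replacement (illustrative; wipe_and_release is not a BoringSSL function): callers of the former macro keep compiling unchanged, and the length argument is still accepted only for OpenSSL compatibility, since BoringSSL zeroes allocations on free anyway.

#include <stddef.h>
#include <stdint.h>

#include <openssl/mem.h>

// |OPENSSL_clear_free| now resolves to a real exported symbol instead of
// expanding to |OPENSSL_free| in the preprocessor.
static void wipe_and_release(uint8_t *secret, size_t len) {
  OPENSSL_clear_free(secret, len);
}
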
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index b28dcf6..a539b20 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -3069,7 +3069,7 @@
 // host may use a delegated credential to sign the handshake. Once issued,
 // credentials can't be revoked. In order to mitigate the damage in case the
 // credential secret key is compromised, the credential is only valid for a
-// short time (days, hours, or even minutes). This library implements draft-02
+// short time (days, hours, or even minutes). This library implements draft-03
 // of the protocol spec.
 //
 // The extension ID has not been assigned; we're using 0xff02 for the time
@@ -3082,7 +3082,7 @@
 // SSL_set1_delegated_credential configures the delegated credential (DC) that
 // will be sent to the peer for the current connection. |dc| is the DC in wire
 // format, and |pkey| or |key_method| is the corresponding private key.
-// Currently (as of draft-02), only servers may configure a DC to use in the
+// Currently (as of draft-03), only servers may configure a DC to use in the
 // handshake.
 //
 // The DC will only be used if the protocol version is correct and the signature
@@ -3440,13 +3440,6 @@
 
 // Obscure functions.
 
-// SSL_get_structure_sizes returns the sizes of the SSL, SSL_CTX and
-// SSL_SESSION structures so that a test can ensure that outside code agrees on
-// these values.
-OPENSSL_EXPORT void SSL_get_structure_sizes(size_t *ssl_size,
-                                            size_t *ssl_ctx_size,
-                                            size_t *ssl_session_size);
-
 // SSL_CTX_set_msg_callback installs |cb| as the message callback for |ctx|.
 // This callback will be called when sending or receiving low-level record
 // headers, complete handshake messages, ChangeCipherSpec, and alerts.
diff --git a/src/ssl/ssl_cert.cc b/src/ssl/ssl_cert.cc
index 1b01e7f..54df38f 100644
--- a/src/ssl/ssl_cert.cc
+++ b/src/ssl/ssl_cert.cc
@@ -804,9 +804,7 @@
 
 // ssl_can_serve_dc returns true if the host has configured a DC that it can
 // serve in the handshake. Specifically, it checks that a DC has been
-// configured, that the DC protocol version is the same as the negotiated
-// protocol version, and that the DC signature algorithm is supported by the
-// peer.
+// configured and that the DC signature algorithm is supported by the peer.
 static bool ssl_can_serve_dc(const SSL_HANDSHAKE *hs) {
   // Check that a DC has been configured.
   const CERT *cert = hs->config->cert.get();
diff --git a/src/ssl/ssl_lib.cc b/src/ssl/ssl_lib.cc
index a4f2044..d3e76d0 100644
--- a/src/ssl/ssl_lib.cc
+++ b/src/ssl/ssl_lib.cc
@@ -2672,13 +2672,6 @@
   return SSL_in_false_start(ssl);
 }
 
-void SSL_get_structure_sizes(size_t *ssl_size, size_t *ssl_ctx_size,
-                             size_t *ssl_session_size) {
-  *ssl_size = sizeof(SSL);
-  *ssl_ctx_size = sizeof(SSL_CTX);
-  *ssl_session_size = sizeof(SSL_SESSION);
-}
-
 int SSL_is_server(const SSL *ssl) { return ssl->server; }
 
 int SSL_is_dtls(const SSL *ssl) { return ssl->method->is_dtls; }
diff --git a/src/util/fipstools/delocate/delocate.go b/src/util/fipstools/delocate/delocate.go
index ce1a8b2..593abec 100644
--- a/src/util/fipstools/delocate/delocate.go
+++ b/src/util/fipstools/delocate/delocate.go
@@ -801,6 +801,8 @@
 	// instrCombine merges the source and destination in some fashion, for example
 	// a 2-operand bitwise operation.
 	instrCombine
+	// instrThreeArg merges two sources into a destination in some fashion.
+	instrThreeArg
 	instrOther
 )
 
@@ -831,6 +833,11 @@
 			return instrCombine
 		}
 
+	case "sarxq", "shlxq", "shrxq":
+		if len(args) == 3 {
+			return instrThreeArg
+		}
+
 	case "vpbroadcastq":
 		if len(args) == 2 {
 			return instrTransformingMove
@@ -878,10 +885,24 @@
 	}
 }
 
-func saveRegister(w stringWriter, avoidReg string) (wrapperFunc, string) {
-	reg := "%rax"
-	if reg == avoidReg {
-		reg = "%rbx"
+func saveRegister(w stringWriter, avoidRegs []string) (wrapperFunc, string) {
+	candidates := []string{"%rax", "%rbx", "%rcx", "%rdx"}
+
+	var reg string
+NextCandidate:
+	for _, candidate := range candidates {
+		for _, avoid := range avoidRegs {
+			if candidate == avoid {
+				continue NextCandidate
+			}
+		}
+
+		reg = candidate
+		break
+	}
+
+	if len(reg) == 0 {
+		panic("too many excluded registers")
 	}
 
 	return func(k func()) {
@@ -918,6 +939,13 @@
 	}
 }
 
+func threeArgCombineOp(w stringWriter, instructionName, source1, source2, dest string) wrapperFunc {
+	return func(k func()) {
+		k()
+		w.WriteString("\t" + instructionName + " " + source1 + ", " + source2 + ", " + dest + "\n")
+	}
+}
+
 func isValidLEATarget(reg string) bool {
 	return !strings.HasPrefix(reg, "%xmm") && !strings.HasPrefix(reg, "%ymm") && !strings.HasPrefix(reg, "%zmm")
 }
@@ -1033,9 +1061,6 @@
 				if len(offset) > 0 {
 					return nil, errors.New("loading from GOT with offset is unsupported")
 				}
-				if i != 0 {
-					return nil, errors.New("GOT access must be source operand")
-				}
 				if !d.isRIPRelative(memRef) {
 					return nil, errors.New("GOT access must be IP-relative")
 				}
@@ -1048,10 +1073,15 @@
 					useGOT = true
 				}
 
+				classification := classifyInstruction(instructionName, argNodes)
+				if classification != instrThreeArg && i != 0 {
+					return nil, errors.New("GOT access must be source operand")
+				}
+
 				// Reduce the instruction to movq symbol@GOTPCREL, targetReg.
 				var targetReg string
 				var redzoneCleared bool
-				switch classifyInstruction(instructionName, argNodes) {
+				switch classification {
 				case instrPush:
 					wrappers = append(wrappers, push(d.output))
 					targetReg = "%rax"
@@ -1073,12 +1103,36 @@
 					if !isValidLEATarget(targetReg) {
 						return nil, fmt.Errorf("cannot handle combining instructions targeting non-general registers")
 					}
-					saveRegWrapper, tempReg := saveRegister(d.output, targetReg)
+					saveRegWrapper, tempReg := saveRegister(d.output, []string{targetReg})
 					redzoneCleared = true
 					wrappers = append(wrappers, saveRegWrapper)
 
 					wrappers = append(wrappers, combineOp(d.output, instructionName, tempReg, targetReg))
 					targetReg = tempReg
+				case instrThreeArg:
+					if n := len(argNodes); n != 3 {
+						return nil, fmt.Errorf("three-argument instruction has %d arguments", n)
+					}
+					if i != 0 && i != 1 {
+						return nil, errors.New("GOT access must be from source operand")
+					}
+					targetReg = d.contents(argNodes[2])
+
+					otherSource := d.contents(argNodes[1])
+					if i == 1 {
+						otherSource = d.contents(argNodes[0])
+					}
+
+					saveRegWrapper, tempReg := saveRegister(d.output, []string{targetReg, otherSource})
+					redzoneCleared = true
+					wrappers = append(wrappers, saveRegWrapper)
+
+					if i == 0 {
+						wrappers = append(wrappers, threeArgCombineOp(d.output, instructionName, tempReg, otherSource, targetReg))
+					} else {
+						wrappers = append(wrappers, threeArgCombineOp(d.output, instructionName, otherSource, tempReg, targetReg))
+					}
+					targetReg = tempReg
 				default:
 					return nil, fmt.Errorf("Cannot rewrite GOTPCREL reference for instruction %q", instructionName)
 				}
@@ -1087,7 +1141,7 @@
 					// Sometimes the compiler will load from the GOT to an
 					// XMM register, which is not a valid target of an LEA
 					// instruction.
-					saveRegWrapper, tempReg := saveRegister(d.output, "")
+					saveRegWrapper, tempReg := saveRegister(d.output, nil)
 					wrappers = append(wrappers, saveRegWrapper)
 					isAVX := strings.HasPrefix(instructionName, "v")
 					wrappers = append(wrappers, moveTo(d.output, targetReg, isAVX, tempReg))
diff --git a/src/util/fipstools/delocate/delocate_test.go b/src/util/fipstools/delocate/delocate_test.go
index e0ecc17..269b484 100644
--- a/src/util/fipstools/delocate/delocate_test.go
+++ b/src/util/fipstools/delocate/delocate_test.go
@@ -48,6 +48,7 @@
 	{"x86_64-GOTRewrite", []string{"in.s"}, "out.s"},
 	{"x86_64-LabelRewrite", []string{"in1.s", "in2.s"}, "out.s"},
 	{"x86_64-Sections", []string{"in.s"}, "out.s"},
+	{"x86_64-ThreeArg", []string{"in.s"}, "out.s"},
 }
 
 func TestDelocate(t *testing.T) {
diff --git a/src/util/fipstools/delocate/testdata/x86_64-ThreeArg/in.s b/src/util/fipstools/delocate/testdata/x86_64-ThreeArg/in.s
new file mode 100644
index 0000000..78a09de
--- /dev/null
+++ b/src/util/fipstools/delocate/testdata/x86_64-ThreeArg/in.s
@@ -0,0 +1,14 @@
+	.type foo, @function
+	.globl foo
+foo:
+	movq	%rax, %rax
+	shrxq	%rbx, kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rax
+	shrxq	kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rbx, %rax
+
+
+	.type	kBoringSSLRSASqrtTwo,@object # @kBoringSSLRSASqrtTwo
+	.section	.rodata,"a",@progbits,unique,760
+	.globl	kBoringSSLRSASqrtTwo
+	.p2align	4
+kBoringSSLRSASqrtTwo:
+	.quad	-2404814165548301886    # 0xdea06241f7aa81c2
diff --git a/src/util/fipstools/delocate/testdata/x86_64-ThreeArg/out.s b/src/util/fipstools/delocate/testdata/x86_64-ThreeArg/out.s
new file mode 100644
index 0000000..7ad8f76
--- /dev/null
+++ b/src/util/fipstools/delocate/testdata/x86_64-ThreeArg/out.s
@@ -0,0 +1,114 @@
+.text
+.file 1 "inserted_by_delocate.c"
+.loc 1 1 0
+BORINGSSL_bcm_text_start:
+	.type foo, @function
+	.globl foo
+.Lfoo_local_target:
+foo:
+	movq	%rax, %rax
+# WAS shrxq	%rbx, kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rax
+	leaq -128(%rsp), %rsp
+	pushq %rcx
+	leaq	.LkBoringSSLRSASqrtTwo_local_target(%rip), %rcx
+	shrxq %rbx, %rcx, %rax
+	popq %rcx
+	leaq 128(%rsp), %rsp
+# WAS shrxq	kBoringSSLRSASqrtTwo@GOTPCREL(%rip), %rbx, %rax
+	leaq -128(%rsp), %rsp
+	pushq %rcx
+	leaq	.LkBoringSSLRSASqrtTwo_local_target(%rip), %rcx
+	shrxq %rcx, %rbx, %rax
+	popq %rcx
+	leaq 128(%rsp), %rsp
+
+
+	.type	kBoringSSLRSASqrtTwo,@object # @kBoringSSLRSASqrtTwo
+# WAS .section	.rodata,"a",@progbits,unique,760
+.text
+	.globl	kBoringSSLRSASqrtTwo
+	.p2align	4
+.LkBoringSSLRSASqrtTwo_local_target:
+kBoringSSLRSASqrtTwo:
+	.quad	-2404814165548301886    # 0xdea06241f7aa81c2
+.text
+.loc 1 2 0
+BORINGSSL_bcm_text_end:
+.type OPENSSL_ia32cap_get, @function
+.globl OPENSSL_ia32cap_get
+.LOPENSSL_ia32cap_get_local_target:
+OPENSSL_ia32cap_get:
+	leaq OPENSSL_ia32cap_P(%rip), %rax
+	ret
+.extern OPENSSL_ia32cap_P
+.type OPENSSL_ia32cap_addr_delta, @object
+.size OPENSSL_ia32cap_addr_delta, 8
+OPENSSL_ia32cap_addr_delta:
+.quad OPENSSL_ia32cap_P-OPENSSL_ia32cap_addr_delta
+.type BORINGSSL_bcm_text_hash, @object
+.size BORINGSSL_bcm_text_hash, 64
+BORINGSSL_bcm_text_hash:
+.byte 0xae
+.byte 0x2c
+.byte 0xea
+.byte 0x2a
+.byte 0xbd
+.byte 0xa6
+.byte 0xf3
+.byte 0xec
+.byte 0x97
+.byte 0x7f
+.byte 0x9b
+.byte 0xf6
+.byte 0x94
+.byte 0x9a
+.byte 0xfc
+.byte 0x83
+.byte 0x68
+.byte 0x27
+.byte 0xcb
+.byte 0xa0
+.byte 0xa0
+.byte 0x9f
+.byte 0x6b
+.byte 0x6f
+.byte 0xde
+.byte 0x52
+.byte 0xcd
+.byte 0xe2
+.byte 0xcd
+.byte 0xff
+.byte 0x31
+.byte 0x80
+.byte 0xa2
+.byte 0xd4
+.byte 0xc3
+.byte 0x66
+.byte 0xf
+.byte 0xc2
+.byte 0x6a
+.byte 0x7b
+.byte 0xf4
+.byte 0xbe
+.byte 0x39
+.byte 0xa2
+.byte 0xd7
+.byte 0x25
+.byte 0xdb
+.byte 0x21
+.byte 0x98
+.byte 0xe9
+.byte 0xd5
+.byte 0x53
+.byte 0xbf
+.byte 0x5c
+.byte 0x32
+.byte 0x6
+.byte 0x83
+.byte 0x34
+.byte 0xc
+.byte 0x65
+.byte 0x89
+.byte 0x52
+.byte 0xbd
+.byte 0x1f
diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py
index 8f88e36..1773cba 100644
--- a/src/util/generate_build_files.py
+++ b/src/util/generate_build_files.py
@@ -633,7 +633,10 @@
   ]
 
   ssl_test_files = FindCFiles(os.path.join('src', 'ssl'), OnlyTests)
-  ssl_test_files.append('src/crypto/test/gtest_main.cc')
+  ssl_test_files += [
+      'src/crypto/test/abi_test.cc',
+      'src/crypto/test/gtest_main.cc',
+  ]
 
   fuzz_c_files = FindCFiles(os.path.join('src', 'fuzz'), NoTests)