external/boringssl: Sync to 0726fb76ebe7f422e3c4fb2e25a0064926975770.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/9c33ae85621ef8e00a42309b5101e0bedd02b816..0726fb76ebe7f422e3c4fb2e25a0064926975770

Test: cts-tradefed run cts -m CtsLibcoreOkHttpTestCases -a arm64-v8a
Test: cts-tradefed run cts -m CtsLibcoreTestCases -a arm64-v8a

Change-Id: I6da679b1bbebffd35568794c7f6e45e2d620287b
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index 027b126..aad77f2 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-9c33ae85621ef8e00a42309b5101e0bedd02b816
+0726fb76ebe7f422e3c4fb2e25a0064926975770
diff --git a/linux-x86/crypto/modes/ghash-x86.S b/linux-x86/crypto/modes/ghash-x86.S
index 2872088..84ff2f4 100644
--- a/linux-x86/crypto/modes/ghash-x86.S
+++ b/linux-x86/crypto/modes/ghash-x86.S
@@ -1,211 +1,6 @@
 #if defined(__i386__)
 .file	"ghash-x86.S"
 .text
-.globl	gcm_gmult_4bit_x86
-.hidden	gcm_gmult_4bit_x86
-.type	gcm_gmult_4bit_x86,@function
-.align	16
-gcm_gmult_4bit_x86:
-.L_gcm_gmult_4bit_x86_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	subl	$84,%esp
-	movl	104(%esp),%edi
-	movl	108(%esp),%esi
-	movl	(%edi),%ebp
-	movl	4(%edi),%edx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%ebx
-	movl	$0,16(%esp)
-	movl	$471859200,20(%esp)
-	movl	$943718400,24(%esp)
-	movl	$610271232,28(%esp)
-	movl	$1887436800,32(%esp)
-	movl	$1822425088,36(%esp)
-	movl	$1220542464,40(%esp)
-	movl	$1423966208,44(%esp)
-	movl	$3774873600,48(%esp)
-	movl	$4246732800,52(%esp)
-	movl	$3644850176,56(%esp)
-	movl	$3311403008,60(%esp)
-	movl	$2441084928,64(%esp)
-	movl	$2376073216,68(%esp)
-	movl	$2847932416,72(%esp)
-	movl	$3051356160,76(%esp)
-	movl	%ebp,(%esp)
-	movl	%edx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%ebx,12(%esp)
-	shrl	$20,%ebx
-	andl	$240,%ebx
-	movl	4(%esi,%ebx,1),%ebp
-	movl	(%esi,%ebx,1),%edx
-	movl	12(%esi,%ebx,1),%ecx
-	movl	8(%esi,%ebx,1),%ebx
-	xorl	%eax,%eax
-	movl	$15,%edi
-	jmp	.L000x86_loop
-.align	16
-.L000x86_loop:
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	andb	$240,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	decl	%edi
-	js	.L001x86_break
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	shlb	$4,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	jmp	.L000x86_loop
-.align	16
-.L001x86_break:
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	bswap	%ebp
-	movl	104(%esp),%edi
-	movl	%ebx,12(%edi)
-	movl	%ecx,8(%edi)
-	movl	%edx,4(%edi)
-	movl	%ebp,(%edi)
-	addl	$84,%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.size	gcm_gmult_4bit_x86,.-.L_gcm_gmult_4bit_x86_begin
-.globl	gcm_ghash_4bit_x86
-.hidden	gcm_ghash_4bit_x86
-.type	gcm_ghash_4bit_x86,@function
-.align	16
-gcm_ghash_4bit_x86:
-.L_gcm_ghash_4bit_x86_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	subl	$84,%esp
-	movl	104(%esp),%ebx
-	movl	108(%esp),%esi
-	movl	112(%esp),%edi
-	movl	116(%esp),%ecx
-	addl	%edi,%ecx
-	movl	%ecx,116(%esp)
-	movl	(%ebx),%ebp
-	movl	4(%ebx),%edx
-	movl	8(%ebx),%ecx
-	movl	12(%ebx),%ebx
-	movl	$0,16(%esp)
-	movl	$471859200,20(%esp)
-	movl	$943718400,24(%esp)
-	movl	$610271232,28(%esp)
-	movl	$1887436800,32(%esp)
-	movl	$1822425088,36(%esp)
-	movl	$1220542464,40(%esp)
-	movl	$1423966208,44(%esp)
-	movl	$3774873600,48(%esp)
-	movl	$4246732800,52(%esp)
-	movl	$3644850176,56(%esp)
-	movl	$3311403008,60(%esp)
-	movl	$2441084928,64(%esp)
-	movl	$2376073216,68(%esp)
-	movl	$2847932416,72(%esp)
-	movl	$3051356160,76(%esp)
-.align	16
-.L002x86_outer_loop:
-	xorl	12(%edi),%ebx
-	xorl	8(%edi),%ecx
-	xorl	4(%edi),%edx
-	xorl	(%edi),%ebp
-	movl	%ebx,12(%esp)
-	movl	%ecx,8(%esp)
-	movl	%edx,4(%esp)
-	movl	%ebp,(%esp)
-	shrl	$20,%ebx
-	andl	$240,%ebx
-	movl	4(%esi,%ebx,1),%ebp
-	movl	(%esi,%ebx,1),%edx
-	movl	12(%esi,%ebx,1),%ecx
-	movl	8(%esi,%ebx,1),%ebx
-	xorl	%eax,%eax
-	movl	$15,%edi
-	jmp	.L003x86_loop
-.align	16
-.L003x86_loop:
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	andb	$240,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	decl	%edi
-	js	.L004x86_break
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	shlb	$4,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	jmp	.L003x86_loop
-.align	16
-.L004x86_break:
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	bswap	%ebp
-	movl	112(%esp),%edi
-	leal	16(%edi),%edi
-	cmpl	116(%esp),%edi
-	movl	%edi,112(%esp)
-	jb	.L002x86_outer_loop
-	movl	104(%esp),%edi
-	movl	%ebx,12(%edi)
-	movl	%ecx,8(%edi)
-	movl	%edx,4(%edi)
-	movl	%ebp,(%edi)
-	addl	$84,%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.size	gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin
 .globl	gcm_gmult_4bit_mmx
 .hidden	gcm_gmult_4bit_mmx
 .type	gcm_gmult_4bit_mmx,@function
@@ -218,10 +13,10 @@
 	pushl	%edi
 	movl	20(%esp),%edi
 	movl	24(%esp),%esi
-	call	.L005pic_point
-.L005pic_point:
+	call	.L000pic_point
+.L000pic_point:
 	popl	%eax
-	leal	.Lrem_4bit-.L005pic_point(%eax),%eax
+	leal	.Lrem_4bit-.L000pic_point(%eax),%eax
 	movzbl	15(%edi),%ebx
 	xorl	%ecx,%ecx
 	movl	%ebx,%edx
@@ -232,9 +27,9 @@
 	movq	8(%esi,%ecx,1),%mm0
 	movq	(%esi,%ecx,1),%mm1
 	movd	%mm0,%ebx
-	jmp	.L006mmx_loop
+	jmp	.L001mmx_loop
 .align	16
-.L006mmx_loop:
+.L001mmx_loop:
 	psrlq	$4,%mm0
 	andl	$15,%ebx
 	movq	%mm1,%mm2
@@ -248,7 +43,7 @@
 	pxor	(%esi,%edx,1),%mm1
 	movl	%ecx,%edx
 	pxor	%mm2,%mm0
-	js	.L007mmx_break
+	js	.L002mmx_break
 	shlb	$4,%cl
 	andl	$15,%ebx
 	psrlq	$4,%mm0
@@ -261,9 +56,9 @@
 	movd	%mm0,%ebx
 	pxor	(%esi,%ecx,1),%mm1
 	pxor	%mm2,%mm0
-	jmp	.L006mmx_loop
+	jmp	.L001mmx_loop
 .align	16
-.L007mmx_break:
+.L002mmx_break:
 	shlb	$4,%cl
 	andl	$15,%ebx
 	psrlq	$4,%mm0
@@ -321,10 +116,10 @@
 	movl	28(%esp),%ecx
 	movl	32(%esp),%edx
 	movl	%esp,%ebp
-	call	.L008pic_point
-.L008pic_point:
+	call	.L003pic_point
+.L003pic_point:
 	popl	%esi
-	leal	.Lrem_8bit-.L008pic_point(%esi),%esi
+	leal	.Lrem_8bit-.L003pic_point(%esi),%esi
 	subl	$544,%esp
 	andl	$-64,%esp
 	subl	$16,%esp
@@ -563,7 +358,7 @@
 	movl	8(%eax),%ebx
 	movl	12(%eax),%edx
 .align	16
-.L009outer:
+.L004outer:
 	xorl	12(%ecx),%edx
 	xorl	8(%ecx),%ebx
 	pxor	(%ecx),%mm6
@@ -898,7 +693,7 @@
 	pshufw	$27,%mm6,%mm6
 	bswap	%ebx
 	cmpl	552(%esp),%ecx
-	jne	.L009outer
+	jne	.L004outer
 	movl	544(%esp),%eax
 	movl	%edx,12(%eax)
 	movl	%ebx,8(%eax)
@@ -919,10 +714,10 @@
 .L_gcm_init_clmul_begin:
 	movl	4(%esp),%edx
 	movl	8(%esp),%eax
-	call	.L010pic
-.L010pic:
+	call	.L005pic
+.L005pic:
 	popl	%ecx
-	leal	.Lbswap-.L010pic(%ecx),%ecx
+	leal	.Lbswap-.L005pic(%ecx),%ecx
 	movdqu	(%eax),%xmm2
 	pshufd	$78,%xmm2,%xmm2
 	pshufd	$255,%xmm2,%xmm4
@@ -989,10 +784,10 @@
 .L_gcm_gmult_clmul_begin:
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
-	call	.L011pic
-.L011pic:
+	call	.L006pic
+.L006pic:
 	popl	%ecx
-	leal	.Lbswap-.L011pic(%ecx),%ecx
+	leal	.Lbswap-.L006pic(%ecx),%ecx
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movups	(%edx),%xmm2
@@ -1049,16 +844,16 @@
 	movl	24(%esp),%edx
 	movl	28(%esp),%esi
 	movl	32(%esp),%ebx
-	call	.L012pic
-.L012pic:
+	call	.L007pic
+.L007pic:
 	popl	%ecx
-	leal	.Lbswap-.L012pic(%ecx),%ecx
+	leal	.Lbswap-.L007pic(%ecx),%ecx
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movdqu	(%edx),%xmm2
 .byte	102,15,56,0,197
 	subl	$16,%ebx
-	jz	.L013odd_tail
+	jz	.L008odd_tail
 	movdqu	(%esi),%xmm3
 	movdqu	16(%esi),%xmm6
 .byte	102,15,56,0,221
@@ -1075,10 +870,10 @@
 	movups	16(%edx),%xmm2
 	nop
 	subl	$32,%ebx
-	jbe	.L014even_tail
-	jmp	.L015mod_loop
+	jbe	.L009even_tail
+	jmp	.L010mod_loop
 .align	32
-.L015mod_loop:
+.L010mod_loop:
 	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
@@ -1133,8 +928,8 @@
 .byte	102,15,58,68,221,0
 	leal	32(%esi),%esi
 	subl	$32,%ebx
-	ja	.L015mod_loop
-.L014even_tail:
+	ja	.L010mod_loop
+.L009even_tail:
 	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
@@ -1173,9 +968,9 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 	testl	%ebx,%ebx
-	jnz	.L016done
+	jnz	.L011done
 	movups	(%edx),%xmm2
-.L013odd_tail:
+.L008odd_tail:
 	movdqu	(%esi),%xmm3
 .byte	102,15,56,0,221
 	pxor	%xmm3,%xmm0
@@ -1214,7 +1009,7 @@
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-.L016done:
+.L011done:
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%eax)
 	popl	%edi
diff --git a/linux-x86_64/crypto/ec/p256-x86_64-asm.S b/linux-x86_64/crypto/ec/p256-x86_64-asm.S
index e059dd6..785f269 100644
--- a/linux-x86_64/crypto/ec/p256-x86_64-asm.S
+++ b/linux-x86_64/crypto/ec/p256-x86_64-asm.S
@@ -17,48 +17,6 @@
 .LONE_mont:
 .quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
 
-.type	ecp_nistz256_mul_by_2,@function
-.align	64
-ecp_nistz256_mul_by_2:
-	pushq	%r12
-	pushq	%r13
-
-	movq	0(%rsi),%r8
-	xorq	%r13,%r13
-	movq	8(%rsi),%r9
-	addq	%r8,%r8
-	movq	16(%rsi),%r10
-	adcq	%r9,%r9
-	movq	24(%rsi),%r11
-	leaq	.Lpoly(%rip),%rsi
-	movq	%r8,%rax
-	adcq	%r10,%r10
-	adcq	%r11,%r11
-	movq	%r9,%rdx
-	adcq	$0,%r13
-
-	subq	0(%rsi),%r8
-	movq	%r10,%rcx
-	sbbq	8(%rsi),%r9
-	sbbq	16(%rsi),%r10
-	movq	%r11,%r12
-	sbbq	24(%rsi),%r11
-	sbbq	$0,%r13
-
-	cmovcq	%rax,%r8
-	cmovcq	%rdx,%r9
-	movq	%r8,0(%rdi)
-	cmovcq	%rcx,%r10
-	movq	%r9,8(%rdi)
-	cmovcq	%r12,%r11
-	movq	%r10,16(%rdi)
-	movq	%r11,24(%rdi)
-
-	popq	%r13
-	popq	%r12
-	.byte	0xf3,0xc3
-.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
-
 
 
 .globl	ecp_nistz256_neg
@@ -554,103 +512,6 @@
 .size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
 
 
-
-
-
-
-.globl	ecp_nistz256_from_mont
-.hidden ecp_nistz256_from_mont
-.type	ecp_nistz256_from_mont,@function
-.align	32
-ecp_nistz256_from_mont:
-	pushq	%r12
-	pushq	%r13
-
-	movq	0(%rsi),%rax
-	movq	.Lpoly+24(%rip),%r13
-	movq	8(%rsi),%r9
-	movq	16(%rsi),%r10
-	movq	24(%rsi),%r11
-	movq	%rax,%r8
-	movq	.Lpoly+8(%rip),%r12
-
-
-
-	movq	%rax,%rcx
-	shlq	$32,%r8
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r8,%r9
-	adcq	%rcx,%r10
-	adcq	%rax,%r11
-	movq	%r9,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r9,%rcx
-	shlq	$32,%r9
-	movq	%rdx,%r8
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r9,%r10
-	adcq	%rcx,%r11
-	adcq	%rax,%r8
-	movq	%r10,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r10,%rcx
-	shlq	$32,%r10
-	movq	%rdx,%r9
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r10,%r11
-	adcq	%rcx,%r8
-	adcq	%rax,%r9
-	movq	%r11,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r11,%rcx
-	shlq	$32,%r11
-	movq	%rdx,%r10
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r11,%r8
-	adcq	%rcx,%r9
-	movq	%r8,%rcx
-	adcq	%rax,%r10
-	movq	%r9,%rsi
-	adcq	$0,%rdx
-
-
-
-	subq	$-1,%r8
-	movq	%r10,%rax
-	sbbq	%r12,%r9
-	sbbq	$0,%r10
-	movq	%rdx,%r11
-	sbbq	%r13,%rdx
-	sbbq	%r13,%r13
-
-	cmovnzq	%rcx,%r8
-	cmovnzq	%rsi,%r9
-	movq	%r8,0(%rdi)
-	cmovnzq	%rax,%r10
-	movq	%r9,8(%rdi)
-	cmovzq	%rdx,%r11
-	movq	%r10,16(%rdi)
-	movq	%r11,24(%rdi)
-
-	popq	%r13
-	popq	%r12
-	.byte	0xf3,0xc3
-.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
-
-
 .globl	ecp_nistz256_select_w5
 .hidden ecp_nistz256_select_w5
 .type	ecp_nistz256_select_w5,@function
diff --git a/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S b/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S
index f01692e..9953e4c 100644
--- a/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S
+++ b/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S
@@ -1,19 +1,798 @@
 #if defined(__x86_64__)
 .text	
 
-.globl	aesni_gcm_encrypt
-.hidden aesni_gcm_encrypt
-.type	aesni_gcm_encrypt,@function
-aesni_gcm_encrypt:
-	xorl	%eax,%eax
-	.byte	0xf3,0xc3
-.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.type	_aesni_ctr32_ghash_6x,@function
+.align	32
+_aesni_ctr32_ghash_6x:
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	.Loop6x
 
+.align	32
+.Loop6x:
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$11,%ebp
+	jb	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	je	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10
+	subq	$0x6,%rdx
+	jc	.L6x_done
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x
+
+.L6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
 .globl	aesni_gcm_decrypt
 .hidden aesni_gcm_decrypt
 .type	aesni_gcm_decrypt,@function
+.align	32
 aesni_gcm_decrypt:
-	xorl	%eax,%eax
+	xorq	%r10,%r10
+
+
+
+	cmpq	$0x60,%rdx
+	jb	.Lgcm_dec_abort
+
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r9),%xmm8
+	andq	$-128,%rsp
+	vmovdqu	(%r11),%xmm0
+	leaq	128(%rcx),%rcx
+	leaq	32+32(%r9),%r9
+	movl	240-128(%rcx),%ebp
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Ldec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Ldec_no_key_aliasing
+	subq	%r15,%rsp
+.Ldec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7
+	leaq	(%rdi),%r14
+	vmovdqu	64(%rdi),%xmm4
+
+
+
+
+
+
+
+	leaq	-192(%rdi,%rdx,1),%r15
+
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx
+	xorq	%r10,%r10
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+
+	call	_aesni_ctr32_ghash_6x
+
+	vmovups	%xmm9,-96(%rsi)
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lgcm_dec_abort:
+	movq	%r10,%rax
 	.byte	0xf3,0xc3
 .size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type	_aesni_ctr32_6x,@function
+.align	32
+_aesni_ctr32_6x:
+	vmovdqu	0-128(%rcx),%xmm4
+	vmovdqu	32(%r11),%xmm2
+	leaq	-1(%rbp),%r13
+	vmovups	16-128(%rcx),%xmm15
+	leaq	32-128(%rcx),%r12
+	vpxor	%xmm4,%xmm1,%xmm9
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32_2
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	.Loop_ctr32
+
+	vmovdqu	(%r12),%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	.byte	0xf3,0xc3
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.hidden aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,@function
+.align	32
+aesni_gcm_encrypt:
+	xorq	%r10,%r10
+
+
+
+
+	cmpq	$288,%rdx
+	jb	.Lgcm_enc_abort
+
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx
+	vmovdqu	(%r11),%xmm0
+	andq	$-128,%rsp
+	movl	240-128(%rcx),%ebp
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Lenc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Lenc_no_key_aliasing
+	subq	%r15,%rsp
+.Lenc_no_key_aliasing:
+
+	leaq	(%rsi),%r14
+
+
+
+
+
+
+
+
+	leaq	-192(%rsi,%rdx,1),%r15
+
+	shrq	$4,%rdx
+
+	call	_aesni_ctr32_6x
+	vpshufb	%xmm0,%xmm9,%xmm8
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x
+
+	vmovdqu	(%r9),%xmm8
+	leaq	32+32(%r9),%r9
+	subq	$12,%rdx
+	movq	$192,%r10
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	32(%rsp),%xmm7
+	vmovdqu	(%r11),%xmm0
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Lgcm_enc_abort:
+	movq	%r10,%rax
+	.byte	0xf3,0xc3
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
 #endif
diff --git a/linux-x86_64/crypto/modes/ghash-x86_64.S b/linux-x86_64/crypto/modes/ghash-x86_64.S
index b47bdc9..b6ca45f 100644
--- a/linux-x86_64/crypto/modes/ghash-x86_64.S
+++ b/linux-x86_64/crypto/modes/ghash-x86_64.S
@@ -1257,7 +1257,108 @@
 .type	gcm_init_avx,@function
 .align	32
 gcm_init_avx:
-	jmp	.L_init_clmul
+	vzeroupper
+
+	vmovdqu	(%rsi),%xmm2
+	vpshufd	$78,%xmm2,%xmm2
+
+
+	vpshufd	$255,%xmm2,%xmm4
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+
+	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10
+	jmp	.Linit_start_avx
+.align	32
+.Linit_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+	vmovdqa	%xmm0,%xmm5
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3
+	vmovdqu	%xmm5,0(%rdi)
+	vpxor	%xmm0,%xmm4,%xmm4
+	vmovdqu	%xmm0,16(%rdi)
+	leaq	48(%rdi),%rdi
+	subq	$1,%r10
+	jnz	.Linit_loop_avx
+
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	.byte	0xf3,0xc3
 .size	gcm_init_avx,.-gcm_init_avx
 .globl	gcm_gmult_avx
 .hidden gcm_gmult_avx
@@ -1271,7 +1372,377 @@
 .type	gcm_ghash_avx,@function
 .align	32
 gcm_ghash_avx:
-	jmp	.L_ghash_clmul
+	vzeroupper
+
+	vmovdqu	(%rdi),%xmm10
+	leaq	.L0x1c2_polynomial(%rip),%r10
+	leaq	64(%rsi),%rsi
+	vmovdqu	.Lbswap_mask(%rip),%xmm13
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	.Lshort_avx
+	subq	$0x80,%rcx
+
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	.Ltail_avx
+
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	.Loop8x_avx
+
+	addq	$0x80,%rcx
+	jmp	.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14
+	leaq	(%rdx,%rcx,1),%rdx
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jmp	.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12
+
+	vpxor	%xmm0,%xmm3,%xmm10
+	vpxor	%xmm1,%xmm4,%xmm11
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	cmpq	$0,%rcx
+	jne	.Lshort_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
 .size	gcm_ghash_avx,.-gcm_ghash_avx
 .align	64
 .Lbswap_mask:
diff --git a/mac-x86/crypto/modes/ghash-x86.S b/mac-x86/crypto/modes/ghash-x86.S
index 8693b82..8ae2183 100644
--- a/mac-x86/crypto/modes/ghash-x86.S
+++ b/mac-x86/crypto/modes/ghash-x86.S
@@ -1,207 +1,6 @@
 #if defined(__i386__)
 .file	"ghash-x86.S"
 .text
-.globl	_gcm_gmult_4bit_x86
-.private_extern	_gcm_gmult_4bit_x86
-.align	4
-_gcm_gmult_4bit_x86:
-L_gcm_gmult_4bit_x86_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	subl	$84,%esp
-	movl	104(%esp),%edi
-	movl	108(%esp),%esi
-	movl	(%edi),%ebp
-	movl	4(%edi),%edx
-	movl	8(%edi),%ecx
-	movl	12(%edi),%ebx
-	movl	$0,16(%esp)
-	movl	$471859200,20(%esp)
-	movl	$943718400,24(%esp)
-	movl	$610271232,28(%esp)
-	movl	$1887436800,32(%esp)
-	movl	$1822425088,36(%esp)
-	movl	$1220542464,40(%esp)
-	movl	$1423966208,44(%esp)
-	movl	$3774873600,48(%esp)
-	movl	$4246732800,52(%esp)
-	movl	$3644850176,56(%esp)
-	movl	$3311403008,60(%esp)
-	movl	$2441084928,64(%esp)
-	movl	$2376073216,68(%esp)
-	movl	$2847932416,72(%esp)
-	movl	$3051356160,76(%esp)
-	movl	%ebp,(%esp)
-	movl	%edx,4(%esp)
-	movl	%ecx,8(%esp)
-	movl	%ebx,12(%esp)
-	shrl	$20,%ebx
-	andl	$240,%ebx
-	movl	4(%esi,%ebx,1),%ebp
-	movl	(%esi,%ebx,1),%edx
-	movl	12(%esi,%ebx,1),%ecx
-	movl	8(%esi,%ebx,1),%ebx
-	xorl	%eax,%eax
-	movl	$15,%edi
-	jmp	L000x86_loop
-.align	4,0x90
-L000x86_loop:
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	andb	$240,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	decl	%edi
-	js	L001x86_break
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	shlb	$4,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	jmp	L000x86_loop
-.align	4,0x90
-L001x86_break:
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	bswap	%ebp
-	movl	104(%esp),%edi
-	movl	%ebx,12(%edi)
-	movl	%ecx,8(%edi)
-	movl	%edx,4(%edi)
-	movl	%ebp,(%edi)
-	addl	$84,%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
-.globl	_gcm_ghash_4bit_x86
-.private_extern	_gcm_ghash_4bit_x86
-.align	4
-_gcm_ghash_4bit_x86:
-L_gcm_ghash_4bit_x86_begin:
-	pushl	%ebp
-	pushl	%ebx
-	pushl	%esi
-	pushl	%edi
-	subl	$84,%esp
-	movl	104(%esp),%ebx
-	movl	108(%esp),%esi
-	movl	112(%esp),%edi
-	movl	116(%esp),%ecx
-	addl	%edi,%ecx
-	movl	%ecx,116(%esp)
-	movl	(%ebx),%ebp
-	movl	4(%ebx),%edx
-	movl	8(%ebx),%ecx
-	movl	12(%ebx),%ebx
-	movl	$0,16(%esp)
-	movl	$471859200,20(%esp)
-	movl	$943718400,24(%esp)
-	movl	$610271232,28(%esp)
-	movl	$1887436800,32(%esp)
-	movl	$1822425088,36(%esp)
-	movl	$1220542464,40(%esp)
-	movl	$1423966208,44(%esp)
-	movl	$3774873600,48(%esp)
-	movl	$4246732800,52(%esp)
-	movl	$3644850176,56(%esp)
-	movl	$3311403008,60(%esp)
-	movl	$2441084928,64(%esp)
-	movl	$2376073216,68(%esp)
-	movl	$2847932416,72(%esp)
-	movl	$3051356160,76(%esp)
-.align	4,0x90
-L002x86_outer_loop:
-	xorl	12(%edi),%ebx
-	xorl	8(%edi),%ecx
-	xorl	4(%edi),%edx
-	xorl	(%edi),%ebp
-	movl	%ebx,12(%esp)
-	movl	%ecx,8(%esp)
-	movl	%edx,4(%esp)
-	movl	%ebp,(%esp)
-	shrl	$20,%ebx
-	andl	$240,%ebx
-	movl	4(%esi,%ebx,1),%ebp
-	movl	(%esi,%ebx,1),%edx
-	movl	12(%esi,%ebx,1),%ecx
-	movl	8(%esi,%ebx,1),%ebx
-	xorl	%eax,%eax
-	movl	$15,%edi
-	jmp	L003x86_loop
-.align	4,0x90
-L003x86_loop:
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	andb	$240,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	decl	%edi
-	js	L004x86_break
-	movb	%bl,%al
-	shrdl	$4,%ecx,%ebx
-	andb	$15,%al
-	shrdl	$4,%edx,%ecx
-	shrdl	$4,%ebp,%edx
-	shrl	$4,%ebp
-	xorl	16(%esp,%eax,4),%ebp
-	movb	(%esp,%edi,1),%al
-	shlb	$4,%al
-	xorl	8(%esi,%eax,1),%ebx
-	xorl	12(%esi,%eax,1),%ecx
-	xorl	(%esi,%eax,1),%edx
-	xorl	4(%esi,%eax,1),%ebp
-	jmp	L003x86_loop
-.align	4,0x90
-L004x86_break:
-	bswap	%ebx
-	bswap	%ecx
-	bswap	%edx
-	bswap	%ebp
-	movl	112(%esp),%edi
-	leal	16(%edi),%edi
-	cmpl	116(%esp),%edi
-	movl	%edi,112(%esp)
-	jb	L002x86_outer_loop
-	movl	104(%esp),%edi
-	movl	%ebx,12(%edi)
-	movl	%ecx,8(%edi)
-	movl	%edx,4(%edi)
-	movl	%ebp,(%edi)
-	addl	$84,%esp
-	popl	%edi
-	popl	%esi
-	popl	%ebx
-	popl	%ebp
-	ret
 .globl	_gcm_gmult_4bit_mmx
 .private_extern	_gcm_gmult_4bit_mmx
 .align	4
@@ -213,10 +12,10 @@
 	pushl	%edi
 	movl	20(%esp),%edi
 	movl	24(%esp),%esi
-	call	L005pic_point
-L005pic_point:
+	call	L000pic_point
+L000pic_point:
 	popl	%eax
-	leal	Lrem_4bit-L005pic_point(%eax),%eax
+	leal	Lrem_4bit-L000pic_point(%eax),%eax
 	movzbl	15(%edi),%ebx
 	xorl	%ecx,%ecx
 	movl	%ebx,%edx
@@ -227,9 +26,9 @@
 	movq	8(%esi,%ecx,1),%mm0
 	movq	(%esi,%ecx,1),%mm1
 	movd	%mm0,%ebx
-	jmp	L006mmx_loop
+	jmp	L001mmx_loop
 .align	4,0x90
-L006mmx_loop:
+L001mmx_loop:
 	psrlq	$4,%mm0
 	andl	$15,%ebx
 	movq	%mm1,%mm2
@@ -243,7 +42,7 @@
 	pxor	(%esi,%edx,1),%mm1
 	movl	%ecx,%edx
 	pxor	%mm2,%mm0
-	js	L007mmx_break
+	js	L002mmx_break
 	shlb	$4,%cl
 	andl	$15,%ebx
 	psrlq	$4,%mm0
@@ -256,9 +55,9 @@
 	movd	%mm0,%ebx
 	pxor	(%esi,%ecx,1),%mm1
 	pxor	%mm2,%mm0
-	jmp	L006mmx_loop
+	jmp	L001mmx_loop
 .align	4,0x90
-L007mmx_break:
+L002mmx_break:
 	shlb	$4,%cl
 	andl	$15,%ebx
 	psrlq	$4,%mm0
@@ -314,10 +113,10 @@
 	movl	28(%esp),%ecx
 	movl	32(%esp),%edx
 	movl	%esp,%ebp
-	call	L008pic_point
-L008pic_point:
+	call	L003pic_point
+L003pic_point:
 	popl	%esi
-	leal	Lrem_8bit-L008pic_point(%esi),%esi
+	leal	Lrem_8bit-L003pic_point(%esi),%esi
 	subl	$544,%esp
 	andl	$-64,%esp
 	subl	$16,%esp
@@ -556,7 +355,7 @@
 	movl	8(%eax),%ebx
 	movl	12(%eax),%edx
 .align	4,0x90
-L009outer:
+L004outer:
 	xorl	12(%ecx),%edx
 	xorl	8(%ecx),%ebx
 	pxor	(%ecx),%mm6
@@ -891,7 +690,7 @@
 	pshufw	$27,%mm6,%mm6
 	bswap	%ebx
 	cmpl	552(%esp),%ecx
-	jne	L009outer
+	jne	L004outer
 	movl	544(%esp),%eax
 	movl	%edx,12(%eax)
 	movl	%ebx,8(%eax)
@@ -910,10 +709,10 @@
 L_gcm_init_clmul_begin:
 	movl	4(%esp),%edx
 	movl	8(%esp),%eax
-	call	L010pic
-L010pic:
+	call	L005pic
+L005pic:
 	popl	%ecx
-	leal	Lbswap-L010pic(%ecx),%ecx
+	leal	Lbswap-L005pic(%ecx),%ecx
 	movdqu	(%eax),%xmm2
 	pshufd	$78,%xmm2,%xmm2
 	pshufd	$255,%xmm2,%xmm4
@@ -978,10 +777,10 @@
 L_gcm_gmult_clmul_begin:
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
-	call	L011pic
-L011pic:
+	call	L006pic
+L006pic:
 	popl	%ecx
-	leal	Lbswap-L011pic(%ecx),%ecx
+	leal	Lbswap-L006pic(%ecx),%ecx
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movups	(%edx),%xmm2
@@ -1036,16 +835,16 @@
 	movl	24(%esp),%edx
 	movl	28(%esp),%esi
 	movl	32(%esp),%ebx
-	call	L012pic
-L012pic:
+	call	L007pic
+L007pic:
 	popl	%ecx
-	leal	Lbswap-L012pic(%ecx),%ecx
+	leal	Lbswap-L007pic(%ecx),%ecx
 	movdqu	(%eax),%xmm0
 	movdqa	(%ecx),%xmm5
 	movdqu	(%edx),%xmm2
 .byte	102,15,56,0,197
 	subl	$16,%ebx
-	jz	L013odd_tail
+	jz	L008odd_tail
 	movdqu	(%esi),%xmm3
 	movdqu	16(%esi),%xmm6
 .byte	102,15,56,0,221
@@ -1062,10 +861,10 @@
 	movups	16(%edx),%xmm2
 	nop
 	subl	$32,%ebx
-	jbe	L014even_tail
-	jmp	L015mod_loop
+	jbe	L009even_tail
+	jmp	L010mod_loop
 .align	5,0x90
-L015mod_loop:
+L010mod_loop:
 	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
@@ -1120,8 +919,8 @@
 .byte	102,15,58,68,221,0
 	leal	32(%esi),%esi
 	subl	$32,%ebx
-	ja	L015mod_loop
-L014even_tail:
+	ja	L010mod_loop
+L009even_tail:
 	pshufd	$78,%xmm0,%xmm4
 	movdqa	%xmm0,%xmm1
 	pxor	%xmm0,%xmm4
@@ -1160,9 +959,9 @@
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
 	testl	%ebx,%ebx
-	jnz	L016done
+	jnz	L011done
 	movups	(%edx),%xmm2
-L013odd_tail:
+L008odd_tail:
 	movdqu	(%esi),%xmm3
 .byte	102,15,56,0,221
 	pxor	%xmm3,%xmm0
@@ -1201,7 +1000,7 @@
 	pxor	%xmm4,%xmm0
 	psrlq	$1,%xmm0
 	pxor	%xmm1,%xmm0
-L016done:
+L011done:
 .byte	102,15,56,0,197
 	movdqu	%xmm0,(%eax)
 	popl	%edi
diff --git a/mac-x86_64/crypto/ec/p256-x86_64-asm.S b/mac-x86_64/crypto/ec/p256-x86_64-asm.S
index 97fb75a..aa434bf 100644
--- a/mac-x86_64/crypto/ec/p256-x86_64-asm.S
+++ b/mac-x86_64/crypto/ec/p256-x86_64-asm.S
@@ -17,48 +17,6 @@
 .quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
 
 
-.p2align	6
-ecp_nistz256_mul_by_2:
-	pushq	%r12
-	pushq	%r13
-
-	movq	0(%rsi),%r8
-	xorq	%r13,%r13
-	movq	8(%rsi),%r9
-	addq	%r8,%r8
-	movq	16(%rsi),%r10
-	adcq	%r9,%r9
-	movq	24(%rsi),%r11
-	leaq	L$poly(%rip),%rsi
-	movq	%r8,%rax
-	adcq	%r10,%r10
-	adcq	%r11,%r11
-	movq	%r9,%rdx
-	adcq	$0,%r13
-
-	subq	0(%rsi),%r8
-	movq	%r10,%rcx
-	sbbq	8(%rsi),%r9
-	sbbq	16(%rsi),%r10
-	movq	%r11,%r12
-	sbbq	24(%rsi),%r11
-	sbbq	$0,%r13
-
-	cmovcq	%rax,%r8
-	cmovcq	%rdx,%r9
-	movq	%r8,0(%rdi)
-	cmovcq	%rcx,%r10
-	movq	%r9,8(%rdi)
-	cmovcq	%r12,%r11
-	movq	%r10,16(%rdi)
-	movq	%r11,24(%rdi)
-
-	popq	%r13
-	popq	%r12
-	.byte	0xf3,0xc3
-
-
-
 
 .globl	_ecp_nistz256_neg
 .private_extern _ecp_nistz256_neg
@@ -553,103 +511,6 @@
 
 
 
-
-
-
-
-.globl	_ecp_nistz256_from_mont
-.private_extern _ecp_nistz256_from_mont
-
-.p2align	5
-_ecp_nistz256_from_mont:
-	pushq	%r12
-	pushq	%r13
-
-	movq	0(%rsi),%rax
-	movq	L$poly+24(%rip),%r13
-	movq	8(%rsi),%r9
-	movq	16(%rsi),%r10
-	movq	24(%rsi),%r11
-	movq	%rax,%r8
-	movq	L$poly+8(%rip),%r12
-
-
-
-	movq	%rax,%rcx
-	shlq	$32,%r8
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r8,%r9
-	adcq	%rcx,%r10
-	adcq	%rax,%r11
-	movq	%r9,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r9,%rcx
-	shlq	$32,%r9
-	movq	%rdx,%r8
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r9,%r10
-	adcq	%rcx,%r11
-	adcq	%rax,%r8
-	movq	%r10,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r10,%rcx
-	shlq	$32,%r10
-	movq	%rdx,%r9
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r10,%r11
-	adcq	%rcx,%r8
-	adcq	%rax,%r9
-	movq	%r11,%rax
-	adcq	$0,%rdx
-
-
-
-	movq	%r11,%rcx
-	shlq	$32,%r11
-	movq	%rdx,%r10
-	mulq	%r13
-	shrq	$32,%rcx
-	addq	%r11,%r8
-	adcq	%rcx,%r9
-	movq	%r8,%rcx
-	adcq	%rax,%r10
-	movq	%r9,%rsi
-	adcq	$0,%rdx
-
-
-
-	subq	$-1,%r8
-	movq	%r10,%rax
-	sbbq	%r12,%r9
-	sbbq	$0,%r10
-	movq	%rdx,%r11
-	sbbq	%r13,%rdx
-	sbbq	%r13,%r13
-
-	cmovnzq	%rcx,%r8
-	cmovnzq	%rsi,%r9
-	movq	%r8,0(%rdi)
-	cmovnzq	%rax,%r10
-	movq	%r9,8(%rdi)
-	cmovzq	%rdx,%r11
-	movq	%r10,16(%rdi)
-	movq	%r11,24(%rdi)
-
-	popq	%r13
-	popq	%r12
-	.byte	0xf3,0xc3
-
-
-
 .globl	_ecp_nistz256_select_w5
 .private_extern _ecp_nistz256_select_w5
 
diff --git a/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S b/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S
index 21d5ad6..901bb96 100644
--- a/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S
+++ b/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S
@@ -1,19 +1,798 @@
 #if defined(__x86_64__)
 .text	
 
-.globl	_aesni_gcm_encrypt
-.private_extern _aesni_gcm_encrypt
 
-_aesni_gcm_encrypt:
-	xorl	%eax,%eax
+.p2align	5
+_aesni_ctr32_ghash_6x:
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	L$oop6x
+
+.p2align	5
+L$oop6x:
+	addl	$100663296,%ebx
+	jc	L$handle_ctr32
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+L$resume_ctr32:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$11,%ebp
+	jb	L$enc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	je	L$enc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	L$enc_tail
+
+.p2align	5
+L$handle_ctr32:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	L$resume_ctr32
+
+.p2align	5
+L$enc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10
+	subq	$0x6,%rdx
+	jc	L$6x_done
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	L$oop6x
+
+L$6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
 	.byte	0xf3,0xc3
 
-
 .globl	_aesni_gcm_decrypt
 .private_extern _aesni_gcm_decrypt
 
+.p2align	5
 _aesni_gcm_decrypt:
-	xorl	%eax,%eax
+	xorq	%r10,%r10
+
+
+
+	cmpq	$0x60,%rdx
+	jb	L$gcm_dec_abort
+
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	L$bswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r9),%xmm8
+	andq	$-128,%rsp
+	vmovdqu	(%r11),%xmm0
+	leaq	128(%rcx),%rcx
+	leaq	32+32(%r9),%r9
+	movl	240-128(%rcx),%ebp
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	L$dec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	L$dec_no_key_aliasing
+	subq	%r15,%rsp
+L$dec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7
+	leaq	(%rdi),%r14
+	vmovdqu	64(%rdi),%xmm4
+
+
+
+
+
+
+
+	leaq	-192(%rdi,%rdx,1),%r15
+
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx
+	xorq	%r10,%r10
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+
+	call	_aesni_ctr32_ghash_6x
+
+	vmovups	%xmm9,-96(%rsi)
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+L$gcm_dec_abort:
+	movq	%r10,%rax
 	.byte	0xf3,0xc3
 
+
+.p2align	5
+_aesni_ctr32_6x:
+	vmovdqu	0-128(%rcx),%xmm4
+	vmovdqu	32(%r11),%xmm2
+	leaq	-1(%rbp),%r13
+	vmovups	16-128(%rcx),%xmm15
+	leaq	32-128(%rcx),%r12
+	vpxor	%xmm4,%xmm1,%xmm9
+	addl	$100663296,%ebx
+	jc	L$handle_ctr32_2
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	L$oop_ctr32
+
+.p2align	4
+L$oop_ctr32:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	L$oop_ctr32
+
+	vmovdqu	(%r12),%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	.byte	0xf3,0xc3
+.p2align	5
+L$handle_ctr32_2:
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	L$oop_ctr32
+
+
+.globl	_aesni_gcm_encrypt
+.private_extern _aesni_gcm_encrypt
+
+.p2align	5
+_aesni_gcm_encrypt:
+	xorq	%r10,%r10
+
+
+
+
+	cmpq	$288,%rdx
+	jb	L$gcm_enc_abort
+
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	L$bswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx
+	vmovdqu	(%r11),%xmm0
+	andq	$-128,%rsp
+	movl	240-128(%rcx),%ebp
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	L$enc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	L$enc_no_key_aliasing
+	subq	%r15,%rsp
+L$enc_no_key_aliasing:
+
+	leaq	(%rsi),%r14
+
+
+
+
+
+
+
+
+	leaq	-192(%rsi,%rdx,1),%r15
+
+	shrq	$4,%rdx
+
+	call	_aesni_ctr32_6x
+	vpshufb	%xmm0,%xmm9,%xmm8
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x
+
+	vmovdqu	(%r9),%xmm8
+	leaq	32+32(%r9),%r9
+	subq	$12,%rdx
+	movq	$192,%r10
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	32(%rsp),%xmm7
+	vmovdqu	(%r11),%xmm0
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+L$gcm_enc_abort:
+	movq	%r10,%rax
+	.byte	0xf3,0xc3
+
+.p2align	6
+L$bswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$poly:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$one_msb:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+L$two_lsb:
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+L$one_lsb:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align	6
 #endif
diff --git a/mac-x86_64/crypto/modes/ghash-x86_64.S b/mac-x86_64/crypto/modes/ghash-x86_64.S
index 1072c7f..334f83f 100644
--- a/mac-x86_64/crypto/modes/ghash-x86_64.S
+++ b/mac-x86_64/crypto/modes/ghash-x86_64.S
@@ -1256,7 +1256,108 @@
 
 .p2align	5
 _gcm_init_avx:
-	jmp	L$_init_clmul
+	vzeroupper
+
+	vmovdqu	(%rsi),%xmm2
+	vpshufd	$78,%xmm2,%xmm2
+
+
+	vpshufd	$255,%xmm2,%xmm4
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+
+	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10
+	jmp	L$init_start_avx
+.p2align	5
+L$init_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+L$init_start_avx:
+	vmovdqa	%xmm0,%xmm5
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3
+	vmovdqu	%xmm5,0(%rdi)
+	vpxor	%xmm0,%xmm4,%xmm4
+	vmovdqu	%xmm0,16(%rdi)
+	leaq	48(%rdi),%rdi
+	subq	$1,%r10
+	jnz	L$init_loop_avx
+
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	.byte	0xf3,0xc3
 
 .globl	_gcm_gmult_avx
 .private_extern _gcm_gmult_avx
@@ -1270,7 +1371,377 @@
 
 .p2align	5
 _gcm_ghash_avx:
-	jmp	L$_ghash_clmul
+	vzeroupper
+
+	vmovdqu	(%rdi),%xmm10
+	leaq	L$0x1c2_polynomial(%rip),%r10
+	leaq	64(%rsi),%rsi
+	vmovdqu	L$bswap_mask(%rip),%xmm13
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	L$short_avx
+	subq	$0x80,%rcx
+
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	L$tail_avx
+
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	L$oop8x_avx
+
+.p2align	5
+L$oop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	L$oop8x_avx
+
+	addq	$0x80,%rcx
+	jmp	L$tail_no_xor_avx
+
+.p2align	5
+L$short_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14
+	leaq	(%rdx,%rcx,1),%rdx
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	L$tail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jmp	L$tail_avx
+
+.p2align	5
+L$tail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15
+L$tail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12
+
+	vpxor	%xmm0,%xmm3,%xmm10
+	vpxor	%xmm1,%xmm4,%xmm11
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	cmpq	$0,%rcx
+	jne	L$short_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
 
 .p2align	6
 L$bswap_mask:
diff --git a/sources.bp b/sources.bp
index 9672673..1b9e815 100644
--- a/sources.bp
+++ b/sources.bp
@@ -174,7 +174,6 @@
         "src/crypto/pem/pem_pkey.c",
         "src/crypto/pem/pem_x509.c",
         "src/crypto/pem/pem_xaux.c",
-        "src/crypto/pkcs8/p5_pbe.c",
         "src/crypto/pkcs8/p5_pbev2.c",
         "src/crypto/pkcs8/p8_pkey.c",
         "src/crypto/pkcs8/pkcs8.c",
@@ -183,6 +182,7 @@
         "src/crypto/poly1305/poly1305_vec.c",
         "src/crypto/pool/pool.c",
         "src/crypto/rand/deterministic.c",
+        "src/crypto/rand/fuchsia.c",
         "src/crypto/rand/rand.c",
         "src/crypto/rand/urandom.c",
         "src/crypto/rand/windows.c",
diff --git a/sources.mk b/sources.mk
index d872d77..dc2449d 100644
--- a/sources.mk
+++ b/sources.mk
@@ -172,7 +172,6 @@
   src/crypto/pem/pem_pkey.c\
   src/crypto/pem/pem_x509.c\
   src/crypto/pem/pem_xaux.c\
-  src/crypto/pkcs8/p5_pbe.c\
   src/crypto/pkcs8/p5_pbev2.c\
   src/crypto/pkcs8/p8_pkey.c\
   src/crypto/pkcs8/pkcs8.c\
@@ -181,6 +180,7 @@
   src/crypto/poly1305/poly1305_vec.c\
   src/crypto/pool/pool.c\
   src/crypto/rand/deterministic.c\
+  src/crypto/rand/fuchsia.c\
   src/crypto/rand/rand.c\
   src/crypto/rand/urandom.c\
   src/crypto/rand/windows.c\
diff --git a/src/BUILDING.md b/src/BUILDING.md
index 8e226c0..c87e8b5 100644
--- a/src/BUILDING.md
+++ b/src/BUILDING.md
@@ -33,7 +33,7 @@
     executable may be configured explicitly by setting `GO_EXECUTABLE`.
 
   * To build the x86 and x86\_64 assembly, your assembler must support AVX2
-    instructions. If using GNU binutils, you must have 2.22 or later.
+    instructions and MOVBE. If using GNU binutils, you must have 2.22 or later.
 
 ## Building
 
diff --git a/src/crypto/bn/bn.c b/src/crypto/bn/bn.c
index 31bb937..e3c55f2 100644
--- a/src/crypto/bn/bn.c
+++ b/src/crypto/bn/bn.c
@@ -172,12 +172,6 @@
   return &kOne;
 }
 
-void BN_with_flags(BIGNUM *out, const BIGNUM *in, int flags) {
-  OPENSSL_memcpy(out, in, sizeof(BIGNUM));
-  out->flags &= ~BN_FLG_MALLOCED;
-  out->flags |= BN_FLG_STATIC_DATA | flags;
-}
-
 /* BN_num_bits_word returns the minimum number of bits needed to represent the
  * value in |l|. */
 unsigned BN_num_bits_word(BN_ULONG l) {
@@ -369,11 +363,3 @@
     bn->neg = 0;
   }
 }
-
-int BN_get_flags(const BIGNUM *bn, int flags) {
-  return bn->flags & flags;
-}
-
-void BN_set_flags(BIGNUM *bn, int flags) {
-  bn->flags |= flags;
-}
diff --git a/src/crypto/bn/bn_test.cc b/src/crypto/bn/bn_test.cc
index 4f544a7..8f93ad0 100644
--- a/src/crypto/bn/bn_test.cc
+++ b/src/crypto/bn/bn_test.cc
@@ -632,15 +632,6 @@
     return false;
   }
 
-  BN_set_flags(a.get(), BN_FLG_CONSTTIME);
-
-  if (!ret ||
-      !BN_mod_inverse(ret.get(), a.get(), m.get(), ctx) ||
-      !ExpectBIGNUMsEqual(t, "inv(A) (mod M) (constant-time)", mod_inv.get(),
-                          ret.get())) {
-    return false;
-  }
-
   return true;
 }
 
diff --git a/src/crypto/bn/exponentiation.c b/src/crypto/bn/exponentiation.c
index 3161a2a..933a731 100644
--- a/src/crypto/bn/exponentiation.c
+++ b/src/crypto/bn/exponentiation.c
@@ -140,12 +140,6 @@
   int i, bits, ret = 0;
   BIGNUM *v, *rr;
 
-  if ((p->flags & BN_FLG_CONSTTIME) != 0) {
-    /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
-  }
-
   BN_CTX_start(ctx);
   if (r == a || r == p) {
     rr = BN_CTX_get(ctx);
@@ -437,12 +431,6 @@
   BIGNUM *val[TABLE_SIZE];
   BN_RECP_CTX recp;
 
-  if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
-    /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
-    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-    return 0;
-  }
-
   bits = BN_num_bits(p);
 
   if (bits == 0) {
@@ -593,10 +581,6 @@
   BIGNUM *val[TABLE_SIZE];
   BN_MONT_CTX *new_mont = NULL;
 
-  if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
-    return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, mont);
-  }
-
   if (!BN_is_odd(m)) {
     OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
     return 0;
diff --git a/src/crypto/bn/gcd.c b/src/crypto/bn/gcd.c
index 9e62da0..7c20b8e 100644
--- a/src/crypto/bn/gcd.c
+++ b/src/crypto/bn/gcd.c
@@ -399,10 +399,6 @@
 
 BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n,
                        BN_CTX *ctx) {
-  int no_inverse;
-
-  BIGNUM *a_reduced = NULL;
-
   BIGNUM *new_out = NULL;
   if (out == NULL) {
     new_out = BN_new();
@@ -414,10 +410,7 @@
   }
 
   int ok = 0;
-
-  int no_branch =
-      (a->flags & BN_FLG_CONSTTIME) != 0 || (n->flags & BN_FLG_CONSTTIME) != 0;
-
+  BIGNUM *a_reduced = NULL;
   if (a->neg || BN_ucmp(a, n) >= 0) {
     a_reduced = BN_dup(a);
     if (a_reduced == NULL) {
@@ -429,7 +422,8 @@
     a = a_reduced;
   }
 
-  if (no_branch || !BN_is_odd(n)) {
+  int no_inverse;
+  if (!BN_is_odd(n)) {
     if (!bn_mod_inverse_general(out, &no_inverse, a, n, ctx)) {
       goto err;
     }
diff --git a/src/crypto/bn/montgomery.c b/src/crypto/bn/montgomery.c
index 70f0585..aa5bc42 100644
--- a/src/crypto/bn/montgomery.c
+++ b/src/crypto/bn/montgomery.c
@@ -187,9 +187,6 @@
     OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
     return 0;
   }
-  if (BN_get_flags(mod, BN_FLG_CONSTTIME)) {
-    BN_set_flags(&mont->N, BN_FLG_CONSTTIME);
-  }
 
   /* Find n0 such that n0 * N == -1 (mod r).
    *
diff --git a/src/crypto/dh/dh.c b/src/crypto/dh/dh.c
index 69a7ec8..33c36f3 100644
--- a/src/crypto/dh/dh.c
+++ b/src/crypto/dh/dh.c
@@ -258,7 +258,6 @@
   int generate_new_key = 0;
   BN_CTX *ctx = NULL;
   BIGNUM *pub_key = NULL, *priv_key = NULL;
-  BIGNUM local_priv;
 
   if (BN_num_bits(dh->p) > OPENSSL_DH_MAX_MODULUS_BITS) {
     OPENSSL_PUT_ERROR(DH, DH_R_MODULUS_TOO_LARGE);
@@ -317,8 +316,7 @@
     }
   }
 
-  BN_with_flags(&local_priv, priv_key, BN_FLG_CONSTTIME);
-  if (!BN_mod_exp_mont_consttime(pub_key, dh->g, &local_priv, dh->p, ctx,
+  if (!BN_mod_exp_mont_consttime(pub_key, dh->g, priv_key, dh->p, ctx,
                                  dh->method_mont_p)) {
     goto err;
   }
@@ -347,7 +345,6 @@
   BIGNUM *shared_key;
   int ret = -1;
   int check_result;
-  BIGNUM local_priv;
 
   if (BN_num_bits(dh->p) > OPENSSL_DH_MAX_MODULUS_BITS) {
     OPENSSL_PUT_ERROR(DH, DH_R_MODULUS_TOO_LARGE);
@@ -379,9 +376,8 @@
     goto err;
   }
 
-  BN_with_flags(&local_priv, dh->priv_key, BN_FLG_CONSTTIME);
-  if (!BN_mod_exp_mont_consttime(shared_key, peers_key, &local_priv, dh->p, ctx,
-                                 dh->method_mont_p)) {
+  if (!BN_mod_exp_mont_consttime(shared_key, peers_key, dh->priv_key, dh->p,
+                                 ctx, dh->method_mont_p)) {
     OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB);
     goto err;
   }
diff --git a/src/crypto/digest/digests.c b/src/crypto/digest/digests.c
index 351e031..9656027 100644
--- a/src/crypto/digest/digests.c
+++ b/src/crypto/digest/digests.c
@@ -59,9 +59,10 @@
 #include <assert.h>
 #include <string.h>
 
+#include <openssl/asn1.h>
 #include <openssl/md4.h>
 #include <openssl/md5.h>
-#include <openssl/obj.h>
+#include <openssl/nid.h>
 #include <openssl/sha.h>
 
 #include "internal.h"
@@ -306,8 +307,36 @@
   return NULL;
 }
 
-const EVP_MD* EVP_get_digestbyobj(const ASN1_OBJECT *obj) {
-  return EVP_get_digestbynid(OBJ_obj2nid(obj));
+static const struct {
+  uint8_t oid[9];
+  uint8_t oid_len;
+  const EVP_MD *(*md_func) (void);
+} kMDOIDs[] = {
+  /* 1.2.840.113549.2.4 */
+  { {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x04}, 8, EVP_md4 },
+  /* 1.2.840.113549.2.5 */
+  { {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05}, 8, EVP_md5 },
+  /* 1.3.14.3.2.26 */
+  { {0x2b, 0x0e, 0x03, 0x02, 0x1a}, 5, EVP_sha1 },
+  /* 2.16.840.1.101.3.4.2.1 */
+  { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01}, 9, EVP_sha256 },
+  /* 2.16.840.1.101.3.4.2.2 */
+  { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02}, 9, EVP_sha384 },
+  /* 2.16.840.1.101.3.4.2.3 */
+  { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03}, 9, EVP_sha512 },
+  /* 2.16.840.1.101.3.4.2.4 */
+  { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04}, 9, EVP_sha224 },
+};
+
+const EVP_MD *EVP_get_digestbyobj(const ASN1_OBJECT *obj) {
+  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMDOIDs); i++) {
+    if (obj->length == kMDOIDs[i].oid_len &&
+        memcmp(obj->data, kMDOIDs[i].oid, obj->length) == 0) {
+      return kMDOIDs[i].md_func();
+    }
+  }
+
+  return NULL;
 }
 
 const EVP_MD *EVP_get_digestbyname(const char *name) {
diff --git a/src/crypto/dsa/dsa.c b/src/crypto/dsa/dsa.c
index 15583be..e2b6695 100644
--- a/src/crypto/dsa/dsa.c
+++ b/src/crypto/dsa/dsa.c
@@ -434,7 +434,6 @@
   int ok = 0;
   BN_CTX *ctx = NULL;
   BIGNUM *pub_key = NULL, *priv_key = NULL;
-  BIGNUM prk;
 
   ctx = BN_CTX_new();
   if (ctx == NULL) {
@@ -461,12 +460,9 @@
     }
   }
 
-  BN_init(&prk);
-  BN_with_flags(&prk, priv_key, BN_FLG_CONSTTIME);
-
   if (!BN_MONT_CTX_set_locked(&dsa->method_mont_p, &dsa->method_mont_lock,
                               dsa->p, ctx) ||
-      !BN_mod_exp_mont_consttime(pub_key, dsa->g, &prk, dsa->p, ctx,
+      !BN_mod_exp_mont_consttime(pub_key, dsa->g, priv_key, dsa->p, ctx,
                                  dsa->method_mont_p)) {
     goto err;
   }
@@ -844,8 +840,6 @@
     goto err;
   }
 
-  BN_set_flags(&k, BN_FLG_CONSTTIME);
-
   if (!BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_p,
                               (CRYPTO_MUTEX *)&dsa->method_mont_lock, dsa->p,
                               ctx) ||
@@ -873,7 +867,6 @@
     goto err;
   }
 
-  BN_set_flags(&kq, BN_FLG_CONSTTIME);
   if (!BN_mod_exp_mont_consttime(r, dsa->g, &kq, dsa->p, ctx,
                                  dsa->method_mont_p)) {
     goto err;
diff --git a/src/crypto/ec/asm/p256-x86_64-asm.pl b/src/crypto/ec/asm/p256-x86_64-asm.pl
index a0b4b18..3cd7b01 100755
--- a/src/crypto/ec/asm/p256-x86_64-asm.pl
+++ b/src/crypto/ec/asm/p256-x86_64-asm.pl
@@ -78,57 +78,12 @@
 ___
 
 {
-################################################################################
-# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
-
 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
 
 $code.=<<___;
 
-.type	ecp_nistz256_mul_by_2,\@function,2
-.align	64
-ecp_nistz256_mul_by_2:
-	push	%r12
-	push	%r13
-
-	mov	8*0($a_ptr), $a0
-	xor	$t4,$t4
-	mov	8*1($a_ptr), $a1
-	add	$a0, $a0		# a0:a3+a0:a3
-	mov	8*2($a_ptr), $a2
-	adc	$a1, $a1
-	mov	8*3($a_ptr), $a3
-	lea	.Lpoly(%rip), $a_ptr
-	 mov	$a0, $t0
-	adc	$a2, $a2
-	adc	$a3, $a3
-	 mov	$a1, $t1
-	adc	\$0, $t4
-
-	sub	8*0($a_ptr), $a0
-	 mov	$a2, $t2
-	sbb	8*1($a_ptr), $a1
-	sbb	8*2($a_ptr), $a2
-	 mov	$a3, $t3
-	sbb	8*3($a_ptr), $a3
-	sbb	\$0, $t4
-
-	cmovc	$t0, $a0
-	cmovc	$t1, $a1
-	mov	$a0, 8*0($r_ptr)
-	cmovc	$t2, $a2
-	mov	$a1, 8*1($r_ptr)
-	cmovc	$t3, $a3
-	mov	$a2, 8*2($r_ptr)
-	mov	$a3, 8*3($r_ptr)
-
-	pop	%r13
-	pop	%r12
-	ret
-.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
-
 ################################################################################
 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
 .globl	ecp_nistz256_neg
@@ -990,110 +945,6 @@
 }
 }
 {
-my ($r_ptr,$in_ptr)=("%rdi","%rsi");
-my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
-my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
-
-$code.=<<___;
-################################################################################
-# void ecp_nistz256_from_mont(
-#   uint64_t res[4],
-#   uint64_t in[4]);
-# This one performs Montgomery multiplication by 1, so we only need the reduction
-
-.globl	ecp_nistz256_from_mont
-.type	ecp_nistz256_from_mont,\@function,2
-.align	32
-ecp_nistz256_from_mont:
-	push	%r12
-	push	%r13
-
-	mov	8*0($in_ptr), %rax
-	mov	.Lpoly+8*3(%rip), $t2
-	mov	8*1($in_ptr), $acc1
-	mov	8*2($in_ptr), $acc2
-	mov	8*3($in_ptr), $acc3
-	mov	%rax, $acc0
-	mov	.Lpoly+8*1(%rip), $t1
-
-	#########################################
-	# First iteration
-	mov	%rax, $t0
-	shl	\$32, $acc0
-	mulq	$t2
-	shr	\$32, $t0
-	add	$acc0, $acc1
-	adc	$t0, $acc2
-	adc	%rax, $acc3
-	 mov	$acc1, %rax
-	adc	\$0, %rdx
-
-	#########################################
-	# Second iteration
-	mov	$acc1, $t0
-	shl	\$32, $acc1
-	mov	%rdx, $acc0
-	mulq	$t2
-	shr	\$32, $t0
-	add	$acc1, $acc2
-	adc	$t0, $acc3
-	adc	%rax, $acc0
-	 mov	$acc2, %rax
-	adc	\$0, %rdx
-
-	##########################################
-	# Third iteration
-	mov	$acc2, $t0
-	shl	\$32, $acc2
-	mov	%rdx, $acc1
-	mulq	$t2
-	shr	\$32, $t0
-	add	$acc2, $acc3
-	adc	$t0, $acc0
-	adc	%rax, $acc1
-	 mov	$acc3, %rax
-	adc	\$0, %rdx
-
-	###########################################
-	# Last iteration
-	mov	$acc3, $t0
-	shl	\$32, $acc3
-	mov	%rdx, $acc2
-	mulq	$t2
-	shr	\$32, $t0
-	add	$acc3, $acc0
-	adc	$t0, $acc1
-	 mov	$acc0, $t0
-	adc	%rax, $acc2
-	 mov	$acc1, $in_ptr
-	adc	\$0, %rdx
-
-	###########################################
-	# Branch-less conditional subtraction
-	sub	\$-1, $acc0
-	 mov	$acc2, %rax
-	sbb	$t1, $acc1
-	sbb	\$0, $acc2
-	 mov	%rdx, $acc3
-	sbb	$t2, %rdx
-	sbb	$t2, $t2
-
-	cmovnz	$t0, $acc0
-	cmovnz	$in_ptr, $acc1
-	mov	$acc0, 8*0($r_ptr)
-	cmovnz	%rax, $acc2
-	mov	$acc1, 8*1($r_ptr)
-	cmovz	%rdx, $acc3
-	mov	$acc2, 8*2($r_ptr)
-	mov	$acc3, 8*3($r_ptr)
-
-	pop	%r13
-	pop	%r12
-	ret
-.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
-___
-}
-{
 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
diff --git a/src/crypto/ec/p256-x86_64.c b/src/crypto/ec/p256-x86_64.c
index 2400740..652d10c 100644
--- a/src/crypto/ec/p256-x86_64.c
+++ b/src/crypto/ec/p256-x86_64.c
@@ -515,13 +515,15 @@
   ecp_nistz256_mod_inverse_mont(z_inv3, point_z);
   ecp_nistz256_sqr_mont(z_inv2, z_inv3);
 
-  /* TODO(davidben): The two calls to |ecp_nistz256_from_mont| may be factored
-   * into one call now that other operations also reduce mod P. */
+  /* Instead of using |ecp_nistz256_from_mont| to convert the |x| coordinate
+   * and then calling |ecp_nistz256_from_mont| again to convert the |y|
+   * coordinate below, convert the common factor |z_inv2| once now, saving one
+   * reduction. */
+  ecp_nistz256_from_mont(z_inv2, z_inv2);
 
   if (x != NULL) {
     BN_ULONG x_aff[P256_LIMBS];
     ecp_nistz256_mul_mont(x_aff, z_inv2, point_x);
-    ecp_nistz256_from_mont(x_aff, x_aff);
     if (!bn_set_words(x, x_aff, P256_LIMBS)) {
       OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
       return 0;
@@ -532,7 +534,6 @@
     BN_ULONG y_aff[P256_LIMBS];
     ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
     ecp_nistz256_mul_mont(y_aff, z_inv3, point_y);
-    ecp_nistz256_from_mont(y_aff, y_aff);
     if (!bn_set_words(y, y_aff, P256_LIMBS)) {
       OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
       return 0;
diff --git a/src/crypto/ec/p256-x86_64.h b/src/crypto/ec/p256-x86_64.h
index 9897699..0132348 100644
--- a/src/crypto/ec/p256-x86_64.h
+++ b/src/crypto/ec/p256-x86_64.h
@@ -51,8 +51,11 @@
 
 /* ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain
  * by multiplying with 1. */
-void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
-                            const BN_ULONG in[P256_LIMBS]);
+static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
+                                          const BN_ULONG in[P256_LIMBS]) {
+  static const BN_ULONG ONE[P256_LIMBS] = { 1 };
+  ecp_nistz256_mul_mont(res, in, ONE);
+}
 
 
 /* P-256 point operations.
diff --git a/src/crypto/ec/wnaf.c b/src/crypto/ec/wnaf.c
index 1594354..67b7f34 100644
--- a/src/crypto/ec/wnaf.c
+++ b/src/crypto/ec/wnaf.c
@@ -242,7 +242,7 @@
   BN_CTX *new_ctx = NULL;
   const EC_POINT *generator = NULL;
   EC_POINT *tmp = NULL;
-  size_t total_num;
+  size_t total_num = 0;
   size_t i, j;
   int k;
   int r_is_inverted = 0;
@@ -251,7 +251,7 @@
   int8_t **wNAF = NULL; /* individual wNAFs */
   size_t *wNAF_len = NULL;
   size_t max_len = 0;
-  size_t num_val;
+  size_t num_val = 0;
   EC_POINT **val = NULL; /* precomputation */
   EC_POINT **v;
   EC_POINT ***val_sub = NULL; /* pointers to sub-arrays of 'val' */
@@ -284,15 +284,14 @@
   }
 
 
-  wsize = OPENSSL_malloc(total_num * sizeof wsize[0]);
-  wNAF_len = OPENSSL_malloc(total_num * sizeof wNAF_len[0]);
-  wNAF = OPENSSL_malloc((total_num + 1) *
-                        sizeof wNAF[0]); /* includes space for pivot */
-  val_sub = OPENSSL_malloc(total_num * sizeof val_sub[0]);
+  wsize = OPENSSL_malloc(total_num * sizeof(wsize[0]));
+  wNAF_len = OPENSSL_malloc(total_num * sizeof(wNAF_len[0]));
+  wNAF = OPENSSL_malloc(total_num * sizeof(wNAF[0]));
+  val_sub = OPENSSL_malloc(total_num * sizeof(val_sub[0]));
 
   /* Ensure wNAF is initialised in case we end up going to err. */
-  if (wNAF) {
-    wNAF[0] = NULL; /* preliminary pivot */
+  if (wNAF != NULL) {
+    OPENSSL_memset(wNAF, 0, total_num * sizeof(wNAF[0]));
   }
 
   if (!wsize || !wNAF_len || !wNAF || !val_sub) {
@@ -309,7 +308,6 @@
     bits = i < num ? BN_num_bits(scalars[i]) : BN_num_bits(g_scalar);
     wsize[i] = window_bits_for_scalar_size(bits);
     num_val += (size_t)1 << (wsize[i] - 1);
-    wNAF[i + 1] = NULL; /* make sure we always have a pivot */
     wNAF[i] =
         compute_wNAF((i < num ? scalars[i] : g_scalar), wsize[i], &wNAF_len[i]);
     if (wNAF[i] == NULL) {
@@ -322,12 +320,12 @@
 
   /* All points we precompute now go into a single array 'val'. 'val_sub[i]' is
    * a pointer to the subarray for the i-th point. */
-  val = OPENSSL_malloc((num_val + 1) * sizeof val[0]);
+  val = OPENSSL_malloc(num_val * sizeof(val[0]));
   if (val == NULL) {
     OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE);
     goto err;
   }
-  val[num_val] = NULL; /* pivot element */
+  OPENSSL_memset(val, 0, num_val * sizeof(val[0]));
 
   /* allocate points for precomputation */
   v = val;
@@ -442,17 +440,15 @@
   OPENSSL_free(wsize);
   OPENSSL_free(wNAF_len);
   if (wNAF != NULL) {
-    int8_t **w;
-
-    for (w = wNAF; *w != NULL; w++) {
-      OPENSSL_free(*w);
+    for (i = 0; i < total_num; i++) {
+      OPENSSL_free(wNAF[i]);
     }
 
     OPENSSL_free(wNAF);
   }
   if (val != NULL) {
-    for (v = val; *v != NULL; v++) {
-      EC_POINT_clear_free(*v);
+    for (i = 0; i < num_val; i++) {
+      EC_POINT_clear_free(val[i]);
     }
 
     OPENSSL_free(val);
diff --git a/src/crypto/modes/asm/aesni-gcm-x86_64.pl b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
index f777a6e..e329741 100644
--- a/src/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -41,17 +41,24 @@
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-# This must be kept in sync with |$avx| in ghash-x86_64.pl; otherwise tags will
+# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will
 # be computed incorrectly.
 #
 # In upstream, this is controlled by shelling out to the compiler to check
 # versions, but BoringSSL is intended to be used with pre-generated perlasm
 # output, so this isn't useful anyway.
-$avx = 0;
+#
+# The upstream code uses the condition |$avx>1| even though no AVX2
+# instructions are used, because it assumes MOVBE is supported by the assembler
+# if and only if AVX2 is also supported by the assembler; see
+# https://marc.info/?l=openssl-dev&m=146567589526984&w=2.
+$avx = 2;
 
 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
+# See the comment above regarding why the condition is ($avx>1) when there are
+# no AVX2 instructions being used.
 if ($avx>1) {{{
 
 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl
index db6eeae..182c29a 100644
--- a/src/crypto/modes/asm/ghash-x86.pl
+++ b/src/crypto/modes/asm/ghash-x86.pl
@@ -258,245 +258,17 @@
 	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
 	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
 }
-
-$suffix = $x86only ? "" : "_x86";
 
-&function_begin("gcm_gmult_4bit".$suffix);
-	&stack_push(16+4+1);			# +1 for stack alignment
-	&mov	($inp,&wparam(0));		# load Xi
-	&mov	($Htbl,&wparam(1));		# load Htable
-
-	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
-	&mov	($Zhl,&DWP(4,$inp));
-	&mov	($Zlh,&DWP(8,$inp));
-	&mov	($Zll,&DWP(12,$inp));
-
-	&deposit_rem_4bit(16);
-
-	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
-	&mov	(&DWP(4,"esp"),$Zhl);
-	&mov	(&DWP(8,"esp"),$Zlh);
-	&mov	(&DWP(12,"esp"),$Zll);
-	&shr	($Zll,20);
-	&and	($Zll,0xf0);
-
-	if ($unroll) {
-		&call	("_x86_gmult_4bit_inner");
-	} else {
-		&x86_loop(0);
-		&mov	($inp,&wparam(0));
-	}
-
-	&mov	(&DWP(12,$inp),$Zll);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(0,$inp),$Zhh);
-	&stack_pop(16+4+1);
-&function_end("gcm_gmult_4bit".$suffix);
-
-&function_begin("gcm_ghash_4bit".$suffix);
-	&stack_push(16+4+1);			# +1 for 64-bit alignment
-	&mov	($Zll,&wparam(0));		# load Xi
-	&mov	($Htbl,&wparam(1));		# load Htable
-	&mov	($inp,&wparam(2));		# load in
-	&mov	("ecx",&wparam(3));		# load len
-	&add	("ecx",$inp);
-	&mov	(&wparam(3),"ecx");
-
-	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
-	&mov	($Zhl,&DWP(4,$Zll));
-	&mov	($Zlh,&DWP(8,$Zll));
-	&mov	($Zll,&DWP(12,$Zll));
-
-	&deposit_rem_4bit(16);
-
-    &set_label("x86_outer_loop",16);
-	&xor	($Zll,&DWP(12,$inp));		# xor with input
-	&xor	($Zlh,&DWP(8,$inp));
-	&xor	($Zhl,&DWP(4,$inp));
-	&xor	($Zhh,&DWP(0,$inp));
-	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
-	&mov	(&DWP(8,"esp"),$Zlh);
-	&mov	(&DWP(4,"esp"),$Zhl);
-	&mov	(&DWP(0,"esp"),$Zhh);
-
-	&shr	($Zll,20);
-	&and	($Zll,0xf0);
-
-	if ($unroll) {
-		&call	("_x86_gmult_4bit_inner");
-	} else {
-		&x86_loop(0);
-		&mov	($inp,&wparam(2));
-	}
-	&lea	($inp,&DWP(16,$inp));
-	&cmp	($inp,&wparam(3));
-	&mov	(&wparam(2),$inp)	if (!$unroll);
-	&jb	(&label("x86_outer_loop"));
-
-	&mov	($inp,&wparam(0));	# load Xi
-	&mov	(&DWP(12,$inp),$Zll);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(0,$inp),$Zhh);
-	&stack_pop(16+4+1);
-&function_end("gcm_ghash_4bit".$suffix);
-
 if (!$x86only) {{{
 
 &static_label("rem_4bit");
 
 if (!$sse2) {{	# pure-MMX "May" version...
 
-$S=12;		# shift factor for rem_4bit
+    # This code was removed since SSE2 is required for BoringSSL. The
+    # outer structure of the code was retained to minimize future merge
+    # conflicts.
 
-&function_begin_B("_mmx_gmult_4bit_inner");
-# MMX version performs 3.5 times better on P4 (see comment in non-MMX
-# routine for further details), 100% better on Opteron, ~70% better
-# on Core2 and PIII... In other words effort is considered to be well
-# spent... Since initial release the loop was unrolled in order to
-# "liberate" register previously used as loop counter. Instead it's
-# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
-# The path involves move of Z.lo from MMX to integer register,
-# effective address calculation and finally merge of value to Z.hi.
-# Reference to rem_4bit is scheduled so late that I had to >>4
-# rem_4bit elements. This resulted in 20-45% procent improvement
-# on contemporary µ-archs.
-{
-    my $cnt;
-    my $rem_4bit = "eax";
-    my @rem = ($Zhh,$Zll);
-    my $nhi = $Zhl;
-    my $nlo = $Zlh;
-
-    my ($Zlo,$Zhi) = ("mm0","mm1");
-    my $tmp = "mm2";
-
-	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
-	&mov	($nhi,$Zll);
-	&mov	(&LB($nlo),&LB($nhi));
-	&shl	(&LB($nlo),4);
-	&and	($nhi,0xf0);
-	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
-	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
-	&movd	($rem[0],$Zlo);
-
-	for ($cnt=28;$cnt>=-2;$cnt--) {
-	    my $odd = $cnt&1;
-	    my $nix = $odd ? $nlo : $nhi;
-
-		&shl	(&LB($nlo),4)			if ($odd);
-		&psrlq	($Zlo,4);
-		&movq	($tmp,$Zhi);
-		&psrlq	($Zhi,4);
-		&pxor	($Zlo,&QWP(8,$Htbl,$nix));
-		&mov	(&LB($nlo),&BP($cnt/2,$inp))	if (!$odd && $cnt>=0);
-		&psllq	($tmp,60);
-		&and	($nhi,0xf0)			if ($odd);
-		&pxor	($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
-		&and	($rem[0],0xf);
-		&pxor	($Zhi,&QWP(0,$Htbl,$nix));
-		&mov	($nhi,$nlo)			if (!$odd && $cnt>=0);
-		&movd	($rem[1],$Zlo);
-		&pxor	($Zlo,$tmp);
-
-		push	(@rem,shift(@rem));		# "rotate" registers
-	}
-
-	&mov	($inp,&DWP(4,$rem_4bit,$rem[1],8));	# last rem_4bit[rem]
-
-	&psrlq	($Zlo,32);	# lower part of Zlo is already there
-	&movd	($Zhl,$Zhi);
-	&psrlq	($Zhi,32);
-	&movd	($Zlh,$Zlo);
-	&movd	($Zhh,$Zhi);
-	&shl	($inp,4);	# compensate for rem_4bit[i] being >>4
-
-	&bswap	($Zll);
-	&bswap	($Zhl);
-	&bswap	($Zlh);
-	&xor	($Zhh,$inp);
-	&bswap	($Zhh);
-
-	&ret	();
-}
-&function_end_B("_mmx_gmult_4bit_inner");
-
-&function_begin("gcm_gmult_4bit_mmx");
-	&mov	($inp,&wparam(0));	# load Xi
-	&mov	($Htbl,&wparam(1));	# load Htable
-
-	&call	(&label("pic_point"));
-	&set_label("pic_point");
-	&blindpop("eax");
-	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
-	&movz	($Zll,&BP(15,$inp));
-
-	&call	("_mmx_gmult_4bit_inner");
-
-	&mov	($inp,&wparam(0));	# load Xi
-	&emms	();
-	&mov	(&DWP(12,$inp),$Zll);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(0,$inp),$Zhh);
-&function_end("gcm_gmult_4bit_mmx");
-
-# Streamed version performs 20% better on P4, 7% on Opteron,
-# 10% on Core2 and PIII...
-&function_begin("gcm_ghash_4bit_mmx");
-	&mov	($Zhh,&wparam(0));	# load Xi
-	&mov	($Htbl,&wparam(1));	# load Htable
-	&mov	($inp,&wparam(2));	# load in
-	&mov	($Zlh,&wparam(3));	# load len
-
-	&call	(&label("pic_point"));
-	&set_label("pic_point");
-	&blindpop("eax");
-	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
-
-	&add	($Zlh,$inp);
-	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
-	&stack_push(4+1);		# +1 for stack alignment
-
-	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
-	&mov	($Zhl,&DWP(4,$Zhh));
-	&mov	($Zlh,&DWP(8,$Zhh));
-	&mov	($Zhh,&DWP(0,$Zhh));
-	&jmp	(&label("mmx_outer_loop"));
-
-    &set_label("mmx_outer_loop",16);
-	&xor	($Zll,&DWP(12,$inp));
-	&xor	($Zhl,&DWP(4,$inp));
-	&xor	($Zlh,&DWP(8,$inp));
-	&xor	($Zhh,&DWP(0,$inp));
-	&mov	(&wparam(2),$inp);
-	&mov	(&DWP(12,"esp"),$Zll);
-	&mov	(&DWP(4,"esp"),$Zhl);
-	&mov	(&DWP(8,"esp"),$Zlh);
-	&mov	(&DWP(0,"esp"),$Zhh);
-
-	&mov	($inp,"esp");
-	&shr	($Zll,24);
-
-	&call	("_mmx_gmult_4bit_inner");
-
-	&mov	($inp,&wparam(2));
-	&lea	($inp,&DWP(16,$inp));
-	&cmp	($inp,&wparam(3));
-	&jb	(&label("mmx_outer_loop"));
-
-	&mov	($inp,&wparam(0));	# load Xi
-	&emms	();
-	&mov	(&DWP(12,$inp),$Zll);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(0,$inp),$Zhh);
-
-	&stack_pop(4+1);
-&function_end("gcm_ghash_4bit_mmx");
-
 }} else {{	# "June" MMX version...
 		# ... has slower "April" gcm_gmult_4bit_mmx with folded
 		# loop. This is done to conserve code size...
diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl
index df8546c..d7471e2 100644
--- a/src/crypto/modes/asm/ghash-x86_64.pl
+++ b/src/crypto/modes/asm/ghash-x86_64.pl
@@ -90,13 +90,13 @@
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-# This must be kept in sync with |$avx| in aesni-gcm-x86_64.pl; otherwise tags
-# will be computed incorrectly.
+# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
+# computed incorrectly.
 #
 # In upstream, this is controlled by shelling out to the compiler to check
 # versions, but BoringSSL is intended to be used with pre-generated perlasm
 # output, so this isn't useful anyway.
-$avx = 0;
+$avx = 1;
 
 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
diff --git a/src/crypto/modes/gcm.c b/src/crypto/modes/gcm.c
index df68c40..1330ad6 100644
--- a/src/crypto/modes/gcm.c
+++ b/src/crypto/modes/gcm.c
@@ -255,19 +255,16 @@
 
 
 #if defined(GHASH_ASM)
+
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-#define GHASH_ASM_X86_OR_64
 #define GCM_FUNCREF_4BIT
 void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
 void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
-#if defined(OPENSSL_X86)
-#define gcm_init_avx gcm_init_clmul
-#define gcm_gmult_avx gcm_gmult_clmul
-#define gcm_ghash_avx gcm_ghash_clmul
-#else
+#if defined(OPENSSL_X86_64)
+#define GHASH_ASM_X86_64
 void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
 void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
@@ -289,11 +286,8 @@
 void gcm_gmult_4bit_mmx(uint64_t Xi[2], const u128 Htable[16]);
 void gcm_ghash_4bit_mmx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                         size_t len);
-
-void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]);
-void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
-                        size_t len);
 #endif
+
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 #include <openssl/arm_arch.h>
 #if __ARM_ARCH__ >= 7
@@ -357,7 +351,8 @@
 #endif
 
 void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
-                       u128 out_table[16], const uint8_t *gcm_key) {
+                       u128 *out_key, u128 out_table[16],
+                       const uint8_t *gcm_key) {
   union {
     uint64_t u[2];
     uint8_t c[16];
@@ -369,7 +364,9 @@
   H.u[0] = CRYPTO_bswap8(H.u[0]);
   H.u[1] = CRYPTO_bswap8(H.u[1]);
 
-#if defined(GHASH_ASM_X86_OR_64)
+  OPENSSL_memcpy(out_key, H.c, 16);
+
+#if defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
     if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
       gcm_init_avx(out_table, H.u);
@@ -377,20 +374,18 @@
       *out_hash = gcm_ghash_avx;
       return;
     }
-
     gcm_init_clmul(out_table, H.u);
     *out_mult = gcm_gmult_clmul;
     *out_hash = gcm_ghash_clmul;
     return;
   }
-#if defined(GHASH_ASM_X86) /* x86 only */
-  if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
-    gcm_init_4bit(out_table, H.u);
-    *out_mult = gcm_gmult_4bit_mmx;
-    *out_hash = gcm_ghash_4bit_mmx;
+#elif defined(GHASH_ASM_X86)
+  if (crypto_gcm_clmul_enabled()) {
+    gcm_init_clmul(out_table, H.u);
+    *out_mult = gcm_gmult_clmul;
+    *out_hash = gcm_ghash_clmul;
     return;
   }
-#endif
 #elif defined(GHASH_ASM_ARM)
   if (pmull_capable()) {
     gcm_init_v8(out_table, H.u);
@@ -416,8 +411,8 @@
 
   gcm_init_4bit(out_table, H.u);
 #if defined(GHASH_ASM_X86)
-  *out_mult = gcm_gmult_4bit_x86;
-  *out_hash = gcm_ghash_4bit_x86;
+  *out_mult = gcm_gmult_4bit_mmx;
+  *out_hash = gcm_ghash_4bit_mmx;
 #else
   *out_mult = gcm_gmult_4bit;
   *out_hash = gcm_ghash_4bit;
@@ -433,7 +428,7 @@
   OPENSSL_memset(gcm_key, 0, sizeof(gcm_key));
   (*block)(gcm_key, gcm_key, aes_key);
 
-  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, gcm_key);
+  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, &ctx->H, ctx->Htable, gcm_key);
 }
 
 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const void *key,
diff --git a/src/crypto/modes/internal.h b/src/crypto/modes/internal.h
index 9b579fa..94072ec 100644
--- a/src/crypto/modes/internal.h
+++ b/src/crypto/modes/internal.h
@@ -150,6 +150,9 @@
     size_t t[16 / sizeof(size_t)];
   } Yi, EKi, EK0, len, Xi;
 
+  /* Note that the order of |Xi|, |H| and |Htable| is fixed by the MOVBE-based,
+   * x86-64, GHASH assembly. */
+  u128 H;
   u128 Htable[16];
   gmult_func gmult;
   ghash_func ghash;
@@ -211,7 +214,8 @@
  * |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware
  * accelerated) functions for performing operations in the GHASH field. */
 void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
-                       u128 out_table[16], const uint8_t *gcm_key);
+                       u128 *out_key, u128 out_table[16],
+                       const uint8_t *gcm_key);
 
 /* CRYPTO_gcm128_init initialises |ctx| to use |block| (typically AES) with
  * the given key. */
@@ -348,7 +352,10 @@
 } polyval_block;
 
 struct polyval_ctx {
+  /* Note that the order of |S|, |H| and |Htable| is fixed by the MOVBE-based,
+   * x86-64, GHASH assembly. */
   polyval_block S;
+  u128 H;
   u128 Htable[16];
   gmult_func gmult;
   ghash_func ghash;
diff --git a/src/crypto/modes/polyval.c b/src/crypto/modes/polyval.c
index 125b256..33d37eb 100644
--- a/src/crypto/modes/polyval.c
+++ b/src/crypto/modes/polyval.c
@@ -57,7 +57,7 @@
   OPENSSL_memcpy(H.c, key, 16);
   reverse_and_mulX_ghash(&H);
 
-  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, H.c);
+  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, &ctx->H, ctx->Htable, H.c);
   OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S));
 }
 
diff --git a/src/crypto/pkcs8/CMakeLists.txt b/src/crypto/pkcs8/CMakeLists.txt
index ffb3821..a2e52e1 100644
--- a/src/crypto/pkcs8/CMakeLists.txt
+++ b/src/crypto/pkcs8/CMakeLists.txt
@@ -7,7 +7,6 @@
 
   pkcs8.c
   p8_pkey.c
-  p5_pbe.c
   p5_pbev2.c
 )
 
diff --git a/src/crypto/pkcs8/internal.h b/src/crypto/pkcs8/internal.h
index 7995e78..9cebe29 100644
--- a/src/crypto/pkcs8/internal.h
+++ b/src/crypto/pkcs8/internal.h
@@ -63,17 +63,36 @@
 #endif
 
 
+#define PBE_UCS2_CONVERT_PASSWORD 0x1
+
+struct pbe_suite {
+  int pbe_nid;
+  const EVP_CIPHER *(*cipher_func)(void);
+  const EVP_MD *(*md_func)(void);
+  /* decrypt_init initialize |ctx| for decrypting. The password is specified by
+   * |pass_raw| and |pass_raw_len|. |param| contains the serialized parameters
+   * field of the AlgorithmIdentifier.
+   *
+   * It returns one on success and zero on error. */
+  int (*decrypt_init)(const struct pbe_suite *suite, EVP_CIPHER_CTX *ctx,
+                      const uint8_t *pass_raw, size_t pass_raw_len, CBS *param);
+  int flags;
+};
+
 #define PKCS5_DEFAULT_ITERATIONS 2048
 #define PKCS5_SALT_LEN 8
 
-/* PKCS5_v2_PBE_keyivgen intializes the supplied |ctx| for PBKDF v2, which must
- * be specified by |param|. The password is specified by |pass_raw| and
- * |pass_raw_len|. |cipher| and |md| are ignored.
- *
- * It returns one on success and zero on error. */
-int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const uint8_t *pass_raw,
-                          size_t pass_raw_len, ASN1_TYPE *param,
-                          const EVP_CIPHER *cipher, const EVP_MD *md, int enc);
+int PKCS5_pbe2_decrypt_init(const struct pbe_suite *suite, EVP_CIPHER_CTX *ctx,
+                            const uint8_t *pass_raw, size_t pass_raw_len,
+                            CBS *param);
+
+/* PKCS5_pbe2_encrypt_init configures |ctx| for encrypting with PKCS #5 PBES2,
+ * as defined in RFC 2998, with the specified parameters. It writes the
+ * corresponding AlgorithmIdentifier to |out|. */
+int PKCS5_pbe2_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx,
+                            const EVP_CIPHER *cipher, unsigned iterations,
+                            const uint8_t *pass_raw, size_t pass_raw_len,
+                            const uint8_t *salt, size_t salt_len);
 
 
 #if defined(__cplusplus)
diff --git a/src/crypto/pkcs8/p5_pbe.c b/src/crypto/pkcs8/p5_pbe.c
deleted file mode 100644
index eee2e00..0000000
--- a/src/crypto/pkcs8/p5_pbe.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project 1999.
- */
-/* ====================================================================
- * Copyright (c) 1999 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com).  This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com). */
-
-#include <string.h>
-
-#include <openssl/asn1t.h>
-#include <openssl/err.h>
-#include <openssl/obj.h>
-#include <openssl/pkcs8.h>
-#include <openssl/rand.h>
-#include <openssl/x509.h>
-
-#include "../internal.h"
-#include "internal.h"
-
-
-/* PKCS#5 password based encryption structure */
-
-ASN1_SEQUENCE(PBEPARAM) = {
-	ASN1_SIMPLE(PBEPARAM, salt, ASN1_OCTET_STRING),
-	ASN1_SIMPLE(PBEPARAM, iter, ASN1_INTEGER)
-} ASN1_SEQUENCE_END(PBEPARAM)
-
-IMPLEMENT_ASN1_FUNCTIONS(PBEPARAM)
-
-
-/* Set an algorithm identifier for a PKCS#5 PBE algorithm */
-
-int PKCS5_pbe_set0_algor(X509_ALGOR *algor, int alg, int iter,
-				const unsigned char *salt, int saltlen)
-	{
-	PBEPARAM *pbe=NULL;
-	ASN1_STRING *pbe_str=NULL;
-	unsigned char *sstr;
-
-	pbe = PBEPARAM_new();
-	if (!pbe)
-		{
-		OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-		goto err;
-		}
-	if(iter <= 0)
-		iter = PKCS5_DEFAULT_ITERATIONS;
-	if (!ASN1_INTEGER_set(pbe->iter, iter))
-		{
-		OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-		goto err;
-		}
-	if (!saltlen)
-		saltlen = PKCS5_SALT_LEN;
-	if (!ASN1_STRING_set(pbe->salt, NULL, saltlen))
-		{
-		OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-		goto err;
-		}
-	sstr = ASN1_STRING_data(pbe->salt);
-	if (salt)
-		OPENSSL_memcpy(sstr, salt, saltlen);
-	else if (!RAND_bytes(sstr, saltlen))
-		goto err;
-
-	if(!ASN1_item_pack(pbe, ASN1_ITEM_rptr(PBEPARAM), &pbe_str))
-		{
-		OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-		goto err;
-		}
-
-	PBEPARAM_free(pbe);
-	pbe = NULL;
-
-	if (X509_ALGOR_set0(algor, OBJ_nid2obj(alg), V_ASN1_SEQUENCE, pbe_str))
-		return 1;
-
-err:
-	if (pbe != NULL)
-		PBEPARAM_free(pbe);
-	if (pbe_str != NULL)
-		ASN1_STRING_free(pbe_str);
-	return 0;
-	}
-
-/* Return an algorithm identifier for a PKCS#5 PBE algorithm */
-
-X509_ALGOR *PKCS5_pbe_set(int alg, int iter,
-				const unsigned char *salt, int saltlen)
-	{
-	X509_ALGOR *ret;
-	ret = X509_ALGOR_new();
-	if (!ret)
-		{
-		OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-		return NULL;
-		}
-
-	if (PKCS5_pbe_set0_algor(ret, alg, iter, salt, saltlen)) 
-		return ret;
-
-	X509_ALGOR_free(ret);
-	return NULL;
-	}
diff --git a/src/crypto/pkcs8/p5_pbev2.c b/src/crypto/pkcs8/p5_pbev2.c
index c16b83f..59e2067 100644
--- a/src/crypto/pkcs8/p5_pbev2.c
+++ b/src/crypto/pkcs8/p5_pbev2.c
@@ -53,390 +53,174 @@
  * (eay@cryptsoft.com).  This product includes software written by Tim
  * Hudson (tjh@cryptsoft.com). */
 
-#include <assert.h>
+#include <openssl/pkcs8.h>
+
 #include <limits.h>
 #include <string.h>
 
-#include <openssl/asn1t.h>
+#include <openssl/bytestring.h>
 #include <openssl/cipher.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
 #include <openssl/obj.h>
-#include <openssl/pkcs8.h>
 #include <openssl/rand.h>
-#include <openssl/x509.h>
 
 #include "internal.h"
 #include "../internal.h"
 
 
-/* PKCS#5 v2.0 password based encryption structures */
-
-ASN1_SEQUENCE(PBE2PARAM) = {
-	ASN1_SIMPLE(PBE2PARAM, keyfunc, X509_ALGOR),
-	ASN1_SIMPLE(PBE2PARAM, encryption, X509_ALGOR)
-} ASN1_SEQUENCE_END(PBE2PARAM)
-
-IMPLEMENT_ASN1_FUNCTIONS(PBE2PARAM)
-
-ASN1_SEQUENCE(PBKDF2PARAM) = {
-	ASN1_SIMPLE(PBKDF2PARAM, salt, ASN1_ANY),
-	ASN1_SIMPLE(PBKDF2PARAM, iter, ASN1_INTEGER),
-	ASN1_OPT(PBKDF2PARAM, keylength, ASN1_INTEGER),
-	ASN1_OPT(PBKDF2PARAM, prf, X509_ALGOR)
-} ASN1_SEQUENCE_END(PBKDF2PARAM)
-
-IMPLEMENT_ASN1_FUNCTIONS(PBKDF2PARAM)
-
-static int ASN1_TYPE_set_octetstring(ASN1_TYPE *a, unsigned char *data, int len)
-	{
-	ASN1_STRING *os;
-
-	if ((os=M_ASN1_OCTET_STRING_new()) == NULL) return(0);
-	if (!M_ASN1_OCTET_STRING_set(os,data,len))
-		{
-		M_ASN1_OCTET_STRING_free(os);
-		return 0;
-		}
-	ASN1_TYPE_set(a,V_ASN1_OCTET_STRING,os);
-	return(1);
-	}
-
-static int param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
-	{
-	unsigned iv_len;
-
-	iv_len = EVP_CIPHER_CTX_iv_length(c);
-	return ASN1_TYPE_set_octetstring(type, c->oiv, iv_len);
-	}
-
-/* Return an algorithm identifier for a PKCS#5 v2.0 PBE algorithm:
- * yes I know this is horrible!
- *
- * Extended version to allow application supplied PRF NID and IV. */
-
-X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
-				 unsigned char *salt, int saltlen,
-				 unsigned char *aiv, int prf_nid)
-{
-	X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
-	int alg_nid, keylen;
-	EVP_CIPHER_CTX ctx;
-	unsigned char iv[EVP_MAX_IV_LENGTH];
-	PBE2PARAM *pbe2 = NULL;
-	const ASN1_OBJECT *obj;
-
-	alg_nid = EVP_CIPHER_nid(cipher);
-	if(alg_nid == NID_undef) {
-		OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER);
-		goto err;
-	}
-	obj = OBJ_nid2obj(alg_nid);
-
-	if(!(pbe2 = PBE2PARAM_new())) goto merr;
-
-	/* Setup the AlgorithmIdentifier for the encryption scheme */
-	scheme = pbe2->encryption;
-
-	scheme->algorithm = (ASN1_OBJECT*) obj;
-	if(!(scheme->parameter = ASN1_TYPE_new())) goto merr;
-
-	/* Create random IV */
-	if (EVP_CIPHER_iv_length(cipher))
-		{
-		if (aiv)
-			OPENSSL_memcpy(iv, aiv, EVP_CIPHER_iv_length(cipher));
-		else if (!RAND_bytes(iv, EVP_CIPHER_iv_length(cipher)))
-  			goto err;
-		}
-
-	EVP_CIPHER_CTX_init(&ctx);
-
-	/* Dummy cipherinit to just setup the IV, and PRF */
-	if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
-		goto err;
-	if(param_to_asn1(&ctx, scheme->parameter) < 0) {
-		OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ERROR_SETTING_CIPHER_PARAMS);
-		EVP_CIPHER_CTX_cleanup(&ctx);
-		goto err;
-	}
-	/* If prf NID unspecified see if cipher has a preference.
-	 * An error is OK here: just means use default PRF.
-	 */
-	if ((prf_nid == -1) && 
-	EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_PBE_PRF_NID, 0, &prf_nid) <= 0)
-		{
-		ERR_clear_error();
-		prf_nid = NID_hmacWithSHA1;
-		}
-	EVP_CIPHER_CTX_cleanup(&ctx);
-
-	/* If its RC2 then we'd better setup the key length */
-
-	if(alg_nid == NID_rc2_cbc)
-		keylen = EVP_CIPHER_key_length(cipher);
-	else
-		keylen = -1;
-
-	/* Setup keyfunc */
-
-	X509_ALGOR_free(pbe2->keyfunc);
-
-	pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
-
-	if (!pbe2->keyfunc)
-		goto merr;
-
-	/* Now set up top level AlgorithmIdentifier */
-
-	if(!(ret = X509_ALGOR_new())) goto merr;
-	if(!(ret->parameter = ASN1_TYPE_new())) goto merr;
-
-	ret->algorithm = (ASN1_OBJECT*) OBJ_nid2obj(NID_pbes2);
-
-	/* Encode PBE2PARAM into parameter */
-
-	if(!ASN1_item_pack(pbe2, ASN1_ITEM_rptr(PBE2PARAM),
-				 &ret->parameter->value.sequence)) goto merr;
-	ret->parameter->type = V_ASN1_SEQUENCE;
-
-	PBE2PARAM_free(pbe2);
-	pbe2 = NULL;
-
-	return ret;
-
-	merr:
-	OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-
-	err:
-	PBE2PARAM_free(pbe2);
-	/* Note 'scheme' is freed as part of pbe2 */
-	X509_ALGOR_free(kalg);
-	X509_ALGOR_free(ret);
-
-	return NULL;
-
-}
-
-X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
-				 unsigned char *salt, int saltlen)
-	{
-	return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
-	}
-
-X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
-				int prf_nid, int keylen)
-	{
-	X509_ALGOR *keyfunc = NULL;
-	PBKDF2PARAM *kdf = NULL;
-	ASN1_OCTET_STRING *osalt = NULL;
-
-	if(!(kdf = PBKDF2PARAM_new()))
-		goto merr;
-	if(!(osalt = M_ASN1_OCTET_STRING_new()))
-		goto merr;
-
-	kdf->salt->value.octet_string = osalt;
-	kdf->salt->type = V_ASN1_OCTET_STRING;
-
-	if (!saltlen)
-		saltlen = PKCS5_SALT_LEN;
-	if (!(osalt->data = OPENSSL_malloc (saltlen)))
-		goto merr;
-
-	osalt->length = saltlen;
-
-	if (salt)
-		OPENSSL_memcpy (osalt->data, salt, saltlen);
-	else if (!RAND_bytes(osalt->data, saltlen))
-		goto merr;
-
-	if(iter <= 0)
-		iter = PKCS5_DEFAULT_ITERATIONS;
-
-	if(!ASN1_INTEGER_set(kdf->iter, iter))
-		goto merr;
-
-	/* If have a key len set it up */
-
-	if(keylen > 0) 
-		{
-		if(!(kdf->keylength = M_ASN1_INTEGER_new()))
-			goto merr;
-		if(!ASN1_INTEGER_set (kdf->keylength, keylen))
-			goto merr;
-		}
-
-	/* prf can stay NULL if we are using hmacWithSHA1 */
-	if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
-		{
-		kdf->prf = X509_ALGOR_new();
-		if (!kdf->prf)
-			goto merr;
-		X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
-					V_ASN1_NULL, NULL);
-		}
-
-	/* Finally setup the keyfunc structure */
-
-	keyfunc = X509_ALGOR_new();
-	if (!keyfunc)
-		goto merr;
-
-	keyfunc->algorithm = (ASN1_OBJECT*) OBJ_nid2obj(NID_id_pbkdf2);
-
-	/* Encode PBKDF2PARAM into parameter of pbe2 */
-
-	if(!(keyfunc->parameter = ASN1_TYPE_new()))
-		goto merr;
-
-	if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
-			 &keyfunc->parameter->value.sequence))
-		goto merr;
-	keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
-	PBKDF2PARAM_free(kdf);
-	return keyfunc;
-
-	merr:
-	OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-	PBKDF2PARAM_free(kdf);
-	X509_ALGOR_free(keyfunc);
-	return NULL;
-	}
-
-static int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx,
-                                    const uint8_t *pass_raw,
-                                    size_t pass_raw_len, const ASN1_TYPE *param,
-                                    const ASN1_TYPE *iv, int enc) {
-  int rv = 0;
-  PBKDF2PARAM *pbkdf2param = NULL;
-
-  if (EVP_CIPHER_CTX_cipher(ctx) == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, CIPHER_R_NO_CIPHER_SET);
-    goto err;
+static int pkcs5_pbe2_cipher_init(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher,
+                                  unsigned iterations, const uint8_t *pass_raw,
+                                  size_t pass_raw_len, const uint8_t *salt,
+                                  size_t salt_len, const uint8_t *iv,
+                                  size_t iv_len, int enc) {
+  if (iv_len != EVP_CIPHER_iv_length(cipher)) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ERROR_SETTING_CIPHER_PARAMS);
+    return 0;
   }
 
-  /* Decode parameters. */
-  if (param == NULL || param->type != V_ASN1_SEQUENCE) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-    goto err;
-  }
-
-  const uint8_t *pbuf = param->value.sequence->data;
-  int plen = param->value.sequence->length;
-  pbkdf2param = d2i_PBKDF2PARAM(NULL, &pbuf, plen);
-  if (pbkdf2param == NULL || pbuf != param->value.sequence->data + plen) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-    goto err;
-  }
-
-  /* Now check the parameters. */
   uint8_t key[EVP_MAX_KEY_LENGTH];
-  const size_t key_len = EVP_CIPHER_CTX_key_length(ctx);
-  assert(key_len <= sizeof(key));
-
-  if (pbkdf2param->keylength != NULL &&
-      ASN1_INTEGER_get(pbkdf2param->keylength) != (int) key_len) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEYLENGTH);
-    goto err;
-  }
-
-  if (pbkdf2param->prf != NULL &&
-      OBJ_obj2nid(pbkdf2param->prf->algorithm) != NID_hmacWithSHA1) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF);
-    goto err;
-  }
-
-  if (pbkdf2param->salt->type != V_ASN1_OCTET_STRING) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_SALT_TYPE);
-    goto err;
-  }
-
-  if (pbkdf2param->iter->type != V_ASN1_INTEGER) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT);
-    goto err;
-  }
-  long iterations = ASN1_INTEGER_get(pbkdf2param->iter);
-  if (iterations <= 0 ||
-      (sizeof(long) > sizeof(unsigned) && iterations > (long)UINT_MAX)) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT);
-    goto err;
-  }
-
-  if (iv->type != V_ASN1_OCTET_STRING || iv->value.octet_string == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ERROR_SETTING_CIPHER_PARAMS);
-    goto err;
-  }
-
-  const size_t iv_len = EVP_CIPHER_CTX_iv_length(ctx);
-  if ((size_t) iv->value.octet_string->length != iv_len) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ERROR_SETTING_CIPHER_PARAMS);
-    goto err;
-  }
-
-  if (!PKCS5_PBKDF2_HMAC_SHA1((const char *) pass_raw, pass_raw_len,
-                              pbkdf2param->salt->value.octet_string->data,
-                              pbkdf2param->salt->value.octet_string->length,
-                              iterations, key_len, key)) {
-    goto err;
-  }
-
-  rv = EVP_CipherInit_ex(ctx, NULL /* cipher */, NULL /* engine */, key,
-                         iv->value.octet_string->data, enc);
-
- err:
-  PBKDF2PARAM_free(pbkdf2param);
-  return rv;
+  int ret = PKCS5_PBKDF2_HMAC_SHA1((const char *)pass_raw, pass_raw_len, salt,
+                                   salt_len, iterations,
+                                   EVP_CIPHER_key_length(cipher), key) &&
+            EVP_CipherInit_ex(ctx, cipher, NULL /* engine */, key, iv, enc);
+  OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH);
+  return ret;
 }
 
-int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const uint8_t *pass_raw,
-                          size_t pass_raw_len, ASN1_TYPE *param,
-                          const EVP_CIPHER *unused, const EVP_MD *unused2,
-                          int enc) {
-  PBE2PARAM *pbe2param = NULL;
-  int rv = 0;
-
-  if (param == NULL ||
-      param->type != V_ASN1_SEQUENCE ||
-      param->value.sequence == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-    goto err;
+int PKCS5_pbe2_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx,
+                            const EVP_CIPHER *cipher, unsigned iterations,
+                            const uint8_t *pass_raw, size_t pass_raw_len,
+                            const uint8_t *salt, size_t salt_len) {
+  int cipher_nid = EVP_CIPHER_nid(cipher);
+  if (cipher_nid == NID_undef) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER);
+    return 0;
   }
 
-  const uint8_t *pbuf = param->value.sequence->data;
-  int plen = param->value.sequence->length;
-  pbe2param = d2i_PBE2PARAM(NULL, &pbuf, plen);
-  if (pbe2param == NULL || pbuf != param->value.sequence->data + plen) {
+  /* Generate a random IV. */
+  uint8_t iv[EVP_MAX_IV_LENGTH];
+  if (!RAND_bytes(iv, EVP_CIPHER_iv_length(cipher))) {
+    return 0;
+  }
+
+  /* See RFC 2898, appendix A. */
+  CBB algorithm, param, kdf, kdf_param, salt_cbb, cipher_cbb, iv_cbb;
+  if (!CBB_add_asn1(out, &algorithm, CBS_ASN1_SEQUENCE) ||
+      !OBJ_nid2cbb(&algorithm, NID_pbes2) ||
+      !CBB_add_asn1(&algorithm, &param, CBS_ASN1_SEQUENCE) ||
+      !CBB_add_asn1(&param, &kdf, CBS_ASN1_SEQUENCE) ||
+      !OBJ_nid2cbb(&kdf, NID_id_pbkdf2) ||
+      !CBB_add_asn1(&kdf, &kdf_param, CBS_ASN1_SEQUENCE) ||
+      !CBB_add_asn1(&kdf_param, &salt_cbb, CBS_ASN1_OCTETSTRING) ||
+      !CBB_add_bytes(&salt_cbb, salt, salt_len) ||
+      !CBB_add_asn1_uint64(&kdf_param, iterations) ||
+      /* Specify a key length for RC2. */
+      (cipher_nid == NID_rc2_cbc &&
+       !CBB_add_asn1_uint64(&kdf_param, EVP_CIPHER_key_length(cipher))) ||
+      /* Omit the PRF. We use the default hmacWithSHA1. */
+      !CBB_add_asn1(&param, &cipher_cbb, CBS_ASN1_SEQUENCE) ||
+      !OBJ_nid2cbb(&cipher_cbb, cipher_nid) ||
+      /* RFC 2898 says RC2-CBC and RC5-CBC-Pad use a SEQUENCE with version and
+       * IV, but OpenSSL always uses an OCTET STRING IV, so we do the same. */
+      !CBB_add_asn1(&cipher_cbb, &iv_cbb, CBS_ASN1_OCTETSTRING) ||
+      !CBB_add_bytes(&iv_cbb, iv, EVP_CIPHER_iv_length(cipher)) ||
+      !CBB_flush(out)) {
+    return 0;
+  }
+
+  return pkcs5_pbe2_cipher_init(ctx, cipher, iterations, pass_raw, pass_raw_len,
+                                salt, salt_len, iv,
+                                EVP_CIPHER_iv_length(cipher), 1 /* encrypt */);
+}
+
+int PKCS5_pbe2_decrypt_init(const struct pbe_suite *suite, EVP_CIPHER_CTX *ctx,
+                            const uint8_t *pass_raw, size_t pass_raw_len,
+                            CBS *param) {
+  CBS pbe_param, kdf, kdf_obj, enc_scheme, enc_obj;
+  if (!CBS_get_asn1(param, &pbe_param, CBS_ASN1_SEQUENCE) ||
+      CBS_len(param) != 0 ||
+      !CBS_get_asn1(&pbe_param, &kdf, CBS_ASN1_SEQUENCE) ||
+      !CBS_get_asn1(&pbe_param, &enc_scheme, CBS_ASN1_SEQUENCE) ||
+      CBS_len(&pbe_param) != 0 ||
+      !CBS_get_asn1(&kdf, &kdf_obj, CBS_ASN1_OBJECT) ||
+      !CBS_get_asn1(&enc_scheme, &enc_obj, CBS_ASN1_OBJECT)) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-    goto err;
+    return 0;
   }
 
   /* Check that the key derivation function is PBKDF2. */
-  if (OBJ_obj2nid(pbe2param->keyfunc->algorithm) != NID_id_pbkdf2) {
+  if (OBJ_cbs2nid(&kdf_obj) != NID_id_pbkdf2) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEY_DERIVATION_FUNCTION);
-    goto err;
+    return 0;
   }
 
   /* See if we recognise the encryption algorithm. */
-  const EVP_CIPHER *cipher =
-      EVP_get_cipherbynid(OBJ_obj2nid(pbe2param->encryption->algorithm));
+  const EVP_CIPHER *cipher = EVP_get_cipherbynid(OBJ_cbs2nid(&enc_obj));
   if (cipher == NULL) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_CIPHER);
-    goto err;
+    return 0;
   }
 
-  /* Fixup cipher based on AlgorithmIdentifier. */
-  if (!EVP_CipherInit_ex(ctx, cipher, NULL /* engine */, NULL /* key */,
-                         NULL /* iv */, enc)) {
-    goto err;
+  /* Parse the KDF parameters. */
+  CBS pbkdf2_params, salt;
+  uint64_t iterations;
+  if (!CBS_get_asn1(&kdf, &pbkdf2_params, CBS_ASN1_SEQUENCE) ||
+      CBS_len(&kdf) != 0 ||
+      !CBS_get_asn1(&pbkdf2_params, &salt, CBS_ASN1_OCTETSTRING) ||
+      !CBS_get_asn1_uint64(&pbkdf2_params, &iterations)) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+    return 0;
   }
 
-  rv = PKCS5_v2_PBKDF2_keyivgen(ctx, pass_raw, pass_raw_len,
-                                pbe2param->keyfunc->parameter,
-                                pbe2param->encryption->parameter, enc);
+  if (iterations == 0 || iterations > UINT_MAX) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT);
+    return 0;
+  }
 
- err:
-  PBE2PARAM_free(pbe2param);
-  return rv;
+  /* The optional keyLength parameter, if present, must match the key length of
+   * the cipher. */
+  if (CBS_peek_asn1_tag(&pbkdf2_params, CBS_ASN1_INTEGER)) {
+    uint64_t key_len;
+    if (!CBS_get_asn1_uint64(&pbkdf2_params, &key_len)) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+      return 0;
+    }
+
+    if (key_len != EVP_CIPHER_key_length(cipher)) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEYLENGTH);
+      return 0;
+    }
+  }
+
+  if (CBS_len(&pbkdf2_params) != 0) {
+    CBS prf;
+    if (!CBS_get_asn1(&pbkdf2_params, &prf, CBS_ASN1_OBJECT) ||
+        CBS_len(&pbkdf2_params) != 0) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+      return 0;
+    }
+
+    /* We only support hmacWithSHA1. It is the DEFAULT, so DER requires it be
+     * omitted, but we match OpenSSL in tolerating it being present. */
+    if (OBJ_cbs2nid(&prf) != NID_hmacWithSHA1) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF);
+      return 0;
+    }
+  }
+
+  /* Parse the encryption scheme parameters. Note OpenSSL does not match the
+   * specification. Per RFC 2898, this should depend on the encryption scheme.
+   * In particular, RC2-CBC and RC5-CBC-Pad use a SEQUENCE with version and IV.
+   * We align with OpenSSL. */
+  CBS iv;
+  if (!CBS_get_asn1(&enc_scheme, &iv, CBS_ASN1_OCTETSTRING) ||
+      CBS_len(&enc_scheme) != 0) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF);
+    return 0;
+  }
+
+  return pkcs5_pbe2_cipher_init(ctx, cipher, (unsigned)iterations, pass_raw,
+                                pass_raw_len, CBS_data(&salt), CBS_len(&salt),
+                                CBS_data(&iv), CBS_len(&iv), 0 /* decrypt */);
 }
diff --git a/src/crypto/pkcs8/pkcs8.c b/src/crypto/pkcs8/pkcs8.c
index e965bc9..efad81d 100644
--- a/src/crypto/pkcs8/pkcs8.c
+++ b/src/crypto/pkcs8/pkcs8.c
@@ -68,6 +68,7 @@
 #include <openssl/hmac.h>
 #include <openssl/mem.h>
 #include <openssl/obj.h>
+#include <openssl/rand.h>
 #include <openssl/x509.h>
 
 #include "internal.h"
@@ -81,23 +82,21 @@
 
 static int ascii_to_ucs2(const char *ascii, size_t ascii_len,
                          uint8_t **out, size_t *out_len) {
-  uint8_t *unitmp;
-  size_t ulen, i;
-
-  ulen = ascii_len * 2 + 2;
-  if (ulen < ascii_len) {
+  size_t ulen = ascii_len * 2 + 2;
+  if (ascii_len * 2 < ascii_len || ulen < ascii_len * 2) {
     return 0;
   }
-  unitmp = OPENSSL_malloc(ulen);
+
+  uint8_t *unitmp = OPENSSL_malloc(ulen);
   if (unitmp == NULL) {
     return 0;
   }
-  for (i = 0; i < ulen - 2; i += 2) {
+  for (size_t i = 0; i < ulen - 2; i += 2) {
     unitmp[i] = 0;
     unitmp[i + 1] = ascii[i >> 1];
   }
 
-  /* Make result double null terminated */
+  /* Terminate the result with a UCS-2 NUL. */
   unitmp[ulen - 2] = 0;
   unitmp[ulen - 1] = 0;
   *out_len = ulen;
@@ -107,7 +106,7 @@
 
 static int pkcs12_key_gen_raw(const uint8_t *pass_raw, size_t pass_raw_len,
                               const uint8_t *salt, size_t salt_len,
-                              uint8_t id, int iterations,
+                              uint8_t id, unsigned iterations,
                               size_t out_len, uint8_t *out,
                               const EVP_MD *md) {
   /* See https://tools.ietf.org/html/rfc7292#appendix-B. Quoted parts of the
@@ -177,7 +176,7 @@
         !EVP_DigestFinal_ex(&ctx, A, &A_len)) {
       goto err;
     }
-    for (int iter = 1; iter < iterations; iter++) {
+    for (unsigned iter = 1; iter < iterations; iter++) {
       if (!EVP_DigestInit_ex(&ctx, md, NULL) ||
           !EVP_DigestUpdate(&ctx, A, A_len) ||
           !EVP_DigestFinal_ex(&ctx, A, &A_len)) {
@@ -223,86 +222,75 @@
   return ret;
 }
 
-static int pkcs12_pbe_keyivgen(EVP_CIPHER_CTX *ctx, const uint8_t *pass_raw,
-                               size_t pass_raw_len, ASN1_TYPE *param,
-                               const EVP_CIPHER *cipher, const EVP_MD *md,
-                               int is_encrypt) {
-  PBEPARAM *pbe;
-  int salt_len, iterations, ret;
-  uint8_t *salt;
-  const uint8_t *pbuf;
-  uint8_t key[EVP_MAX_KEY_LENGTH], iv[EVP_MAX_IV_LENGTH];
+static int pkcs12_pbe_cipher_init(const struct pbe_suite *suite,
+                                  EVP_CIPHER_CTX *ctx, unsigned iterations,
+                                  const uint8_t *pass_raw, size_t pass_raw_len,
+                                  const uint8_t *salt, size_t salt_len,
+                                  int is_encrypt) {
+  const EVP_CIPHER *cipher = suite->cipher_func();
+  const EVP_MD *md = suite->md_func();
 
-  /* Extract useful info from parameter */
-  if (param == NULL || param->type != V_ASN1_SEQUENCE ||
-      param->value.sequence == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-    return 0;
-  }
-
-  pbuf = param->value.sequence->data;
-  pbe = d2i_PBEPARAM(NULL, &pbuf, param->value.sequence->length);
-  if (pbe == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-    return 0;
-  }
-
-  if (!pbe->iter) {
-    iterations = 1;
-  } else {
-    iterations = ASN1_INTEGER_get(pbe->iter);
-  }
-  salt = pbe->salt->data;
-  salt_len = pbe->salt->length;
-  if (!pkcs12_key_gen_raw(pass_raw, pass_raw_len, salt, salt_len, PKCS12_KEY_ID,
-                          iterations, EVP_CIPHER_key_length(cipher), key, md)) {
+  uint8_t key[EVP_MAX_KEY_LENGTH];
+  if (!pkcs12_key_gen_raw(pass_raw, pass_raw_len, salt,
+                          salt_len, PKCS12_KEY_ID, iterations,
+                          EVP_CIPHER_key_length(cipher), key, md)) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEY_GEN_ERROR);
-    PBEPARAM_free(pbe);
     return 0;
   }
-  if (!pkcs12_key_gen_raw(pass_raw, pass_raw_len, salt, salt_len, PKCS12_IV_ID,
-                          iterations, EVP_CIPHER_iv_length(cipher), iv, md)) {
+
+  uint8_t iv[EVP_MAX_IV_LENGTH];
+  if (!pkcs12_key_gen_raw(pass_raw, pass_raw_len, salt,
+                          salt_len, PKCS12_IV_ID, iterations,
+                          EVP_CIPHER_iv_length(cipher), iv, md)) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEY_GEN_ERROR);
-    PBEPARAM_free(pbe);
     return 0;
   }
-  PBEPARAM_free(pbe);
-  ret = EVP_CipherInit_ex(ctx, cipher, NULL, key, iv, is_encrypt);
+
+  int ret = EVP_CipherInit_ex(ctx, cipher, NULL, key, iv, is_encrypt);
   OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH);
   OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH);
   return ret;
 }
 
-typedef int (*keygen_func)(EVP_CIPHER_CTX *ctx, const uint8_t *pass_raw,
-                           size_t pass_raw_len, ASN1_TYPE *param,
-                           const EVP_CIPHER *cipher, const EVP_MD *md,
-                           int is_encrypt);
+static int pkcs12_pbe_decrypt_init(const struct pbe_suite *suite,
+                                   EVP_CIPHER_CTX *ctx, const uint8_t *pass_raw,
+                                   size_t pass_raw_len, CBS *param) {
+  CBS pbe_param, salt;
+  uint64_t iterations;
+  if (!CBS_get_asn1(param, &pbe_param, CBS_ASN1_SEQUENCE) ||
+      !CBS_get_asn1(&pbe_param, &salt, CBS_ASN1_OCTETSTRING) ||
+      !CBS_get_asn1_uint64(&pbe_param, &iterations) ||
+      CBS_len(&pbe_param) != 0 ||
+      CBS_len(param) != 0) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+    return 0;
+  }
 
-struct pbe_suite {
-  int pbe_nid;
-  const EVP_CIPHER* (*cipher_func)(void);
-  const EVP_MD* (*md_func)(void);
-  keygen_func keygen;
-  int flags;
-};
+  if (iterations == 0 || iterations > UINT_MAX) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT);
+    return 0;
+  }
 
-#define PBE_UCS2_CONVERT_PASSWORD 0x1
+  return pkcs12_pbe_cipher_init(suite, ctx, (unsigned)iterations, pass_raw,
+                                pass_raw_len, CBS_data(&salt), CBS_len(&salt),
+                                0 /* decrypt */);
+}
 
 static const struct pbe_suite kBuiltinPBE[] = {
     {
-     NID_pbe_WithSHA1And40BitRC2_CBC, EVP_rc2_40_cbc, EVP_sha1,
-     pkcs12_pbe_keyivgen, PBE_UCS2_CONVERT_PASSWORD
+        NID_pbe_WithSHA1And40BitRC2_CBC, EVP_rc2_40_cbc, EVP_sha1,
+        pkcs12_pbe_decrypt_init, PBE_UCS2_CONVERT_PASSWORD,
     },
     {
-     NID_pbe_WithSHA1And128BitRC4, EVP_rc4, EVP_sha1, pkcs12_pbe_keyivgen,
-     PBE_UCS2_CONVERT_PASSWORD
+        NID_pbe_WithSHA1And128BitRC4, EVP_rc4, EVP_sha1,
+        pkcs12_pbe_decrypt_init, PBE_UCS2_CONVERT_PASSWORD,
     },
     {
-     NID_pbe_WithSHA1And3_Key_TripleDES_CBC, EVP_des_ede3_cbc, EVP_sha1,
-     pkcs12_pbe_keyivgen, PBE_UCS2_CONVERT_PASSWORD
+        NID_pbe_WithSHA1And3_Key_TripleDES_CBC, EVP_des_ede3_cbc, EVP_sha1,
+        pkcs12_pbe_decrypt_init, PBE_UCS2_CONVERT_PASSWORD,
     },
     {
-      NID_pbes2, NULL, NULL,  PKCS5_v2_PBE_keyivgen, 0
+        NID_pbes2, NULL, NULL, PKCS5_pbe2_decrypt_init, 0,
     },
 };
 
@@ -359,130 +347,85 @@
   return 1;
 }
 
-static int pbe_cipher_init(ASN1_OBJECT *pbe_obj,
-                           const uint8_t *pass_raw, size_t pass_raw_len,
-                           ASN1_TYPE *param,
-                           EVP_CIPHER_CTX *ctx, int is_encrypt) {
-  const EVP_CIPHER *cipher;
-  const EVP_MD *md;
-
-  const struct pbe_suite *suite = get_pbe_suite(OBJ_obj2nid(pbe_obj));
+static int pkcs12_pbe_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, int alg,
+                                   unsigned iterations, const uint8_t *pass_raw,
+                                   size_t pass_raw_len, const uint8_t *salt,
+                                   size_t salt_len) {
+  const struct pbe_suite *suite = get_pbe_suite(alg);
   if (suite == NULL) {
-    char obj_str[80];
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_ALGORITHM);
-    if (!pbe_obj) {
-      strncpy(obj_str, "NULL", sizeof(obj_str));
-    } else {
-      i2t_ASN1_OBJECT(obj_str, sizeof(obj_str), pbe_obj);
-    }
-    ERR_add_error_data(2, "TYPE=", obj_str);
     return 0;
   }
 
-  if (suite->cipher_func == NULL) {
-    cipher = NULL;
-  } else {
-    cipher = suite->cipher_func();
-    if (!cipher) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_CIPHER);
-      return 0;
-    }
-  }
-
-  if (suite->md_func == NULL) {
-    md = NULL;
-  } else {
-    md = suite->md_func();
-    if (!md) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_DIGEST);
-      return 0;
-    }
-  }
-
-  if (!suite->keygen(ctx, pass_raw, pass_raw_len, param, cipher, md,
-                     is_encrypt)) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEYGEN_FAILURE);
+  /* See RFC 2898, appendix A.3. */
+  CBB algorithm, param, salt_cbb;
+  if (!CBB_add_asn1(out, &algorithm, CBS_ASN1_SEQUENCE) ||
+      !OBJ_nid2cbb(&algorithm, alg) ||
+      !CBB_add_asn1(&algorithm, &param, CBS_ASN1_SEQUENCE) ||
+      !CBB_add_asn1(&param, &salt_cbb, CBS_ASN1_OCTETSTRING) ||
+      !CBB_add_bytes(&salt_cbb, salt, salt_len) ||
+      !CBB_add_asn1_uint64(&param, iterations) ||
+      !CBB_flush(out)) {
     return 0;
   }
 
-  return 1;
+  return pkcs12_pbe_cipher_init(suite, ctx, iterations, pass_raw, pass_raw_len,
+                                salt, salt_len, 1 /* encrypt */);
 }
 
-static int pbe_crypt(const X509_ALGOR *algor,
-                     const uint8_t *pass_raw, size_t pass_raw_len,
-                     const uint8_t *in, size_t in_len,
-                     uint8_t **out, size_t *out_len,
-                     int is_encrypt) {
-  uint8_t *buf;
-  int n, ret = 0;
+static int pbe_decrypt(uint8_t **out, size_t *out_len, CBS *algorithm,
+                       const uint8_t *pass_raw, size_t pass_raw_len,
+                       const uint8_t *in, size_t in_len) {
+  int ret = 0;
+  uint8_t *buf = NULL;;
   EVP_CIPHER_CTX ctx;
-  unsigned block_size;
-
   EVP_CIPHER_CTX_init(&ctx);
 
-  if (!pbe_cipher_init(algor->algorithm, pass_raw, pass_raw_len,
-                       algor->parameter, &ctx, is_encrypt)) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_CIPHER_ALGORITHM);
-    return 0;
-  }
-  block_size = EVP_CIPHER_CTX_block_size(&ctx);
-
-  if (in_len + block_size < in_len) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_TOO_LONG);
+  CBS obj;
+  if (!CBS_get_asn1(algorithm, &obj, CBS_ASN1_OBJECT)) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
     goto err;
   }
 
-  buf = OPENSSL_malloc(in_len + block_size);
+  const struct pbe_suite *suite = get_pbe_suite(OBJ_cbs2nid(&obj));
+  if (suite == NULL) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_ALGORITHM);
+    goto err;
+  }
+
+  if (!suite->decrypt_init(suite, &ctx, pass_raw, pass_raw_len, algorithm)) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEYGEN_FAILURE);
+    goto err;
+  }
+
+  buf = OPENSSL_malloc(in_len);
   if (buf == NULL) {
     OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
     goto err;
   }
 
-  if (!EVP_CipherUpdate(&ctx, buf, &n, in, in_len)) {
-    OPENSSL_free(buf);
-    OPENSSL_PUT_ERROR(PKCS8, ERR_R_EVP_LIB);
+  if (in_len > INT_MAX) {
+    OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW);
     goto err;
   }
-  *out_len = n;
 
-  if (!EVP_CipherFinal_ex(&ctx, buf + n, &n)) {
-    OPENSSL_free(buf);
-    OPENSSL_PUT_ERROR(PKCS8, ERR_R_EVP_LIB);
+  int n1, n2;
+  if (!EVP_DecryptUpdate(&ctx, buf, &n1, in, (int)in_len) ||
+      !EVP_DecryptFinal_ex(&ctx, buf + n1, &n2)) {
     goto err;
   }
-  *out_len += n;
+
   *out = buf;
+  *out_len = n1 + n2;
   ret = 1;
+  buf = NULL;
 
 err:
+  OPENSSL_free(buf);
   EVP_CIPHER_CTX_cleanup(&ctx);
   return ret;
 }
 
-static void *pkcs12_item_decrypt_d2i(X509_ALGOR *algor, const ASN1_ITEM *it,
-                                     const uint8_t *pass_raw,
-                                     size_t pass_raw_len,
-                                     ASN1_OCTET_STRING *oct) {
-  uint8_t *out;
-  const uint8_t *p;
-  void *ret;
-  size_t out_len;
-
-  if (!pbe_crypt(algor, pass_raw, pass_raw_len, oct->data, oct->length,
-                 &out, &out_len, 0 /* decrypt */)) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_CRYPT_ERROR);
-    return NULL;
-  }
-  p = out;
-  ret = ASN1_item_d2i(NULL, &p, out_len, it);
-  OPENSSL_cleanse(out, out_len);
-  if (!ret) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
-  }
-  OPENSSL_free(out);
-  return ret;
-}
-
 PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass,
                                    int pass_len) {
   uint8_t *pass_raw = NULL;
@@ -503,44 +446,57 @@
 
 PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8, const uint8_t *pass_raw,
                                        size_t pass_raw_len) {
-  return pkcs12_item_decrypt_d2i(pkcs8->algor,
-                                 ASN1_ITEM_rptr(PKCS8_PRIV_KEY_INFO), pass_raw,
-                                 pass_raw_len, pkcs8->digest);
-}
+  PKCS8_PRIV_KEY_INFO *ret = NULL;
+  uint8_t *in = NULL, *out = NULL;
+  size_t out_len = 0;
 
-static ASN1_OCTET_STRING *pkcs12_item_i2d_encrypt(X509_ALGOR *algor,
-                                                  const ASN1_ITEM *it,
-                                                  const uint8_t *pass_raw,
-                                                  size_t pass_raw_len, void *obj) {
-  ASN1_OCTET_STRING *oct;
-  uint8_t *in = NULL;
-  int in_len;
-  size_t crypt_len;
+  /* Convert the legacy ASN.1 object to a byte string. */
+  int in_len = i2d_X509_SIG(pkcs8, &in);
+  if (in_len < 0) {
+    goto err;
+  }
 
-  oct = M_ASN1_OCTET_STRING_new();
-  if (oct == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
-    return NULL;
+  /* See RFC 5208, section 6. */
+  CBS cbs, epki, algorithm, ciphertext;
+  CBS_init(&cbs, in, in_len);
+  if (!CBS_get_asn1(&cbs, &epki, CBS_ASN1_SEQUENCE) ||
+      !CBS_get_asn1(&epki, &algorithm, CBS_ASN1_SEQUENCE) ||
+      !CBS_get_asn1(&epki, &ciphertext, CBS_ASN1_OCTETSTRING) ||
+      CBS_len(&epki) != 0 ||
+      CBS_len(&cbs) != 0) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+    goto err;
   }
-  in_len = ASN1_item_i2d(obj, &in, it);
-  if (!in) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ENCODE_ERROR);
-    return NULL;
+
+  if (!pbe_decrypt(&out, &out_len, &algorithm, pass_raw, pass_raw_len,
+                   CBS_data(&ciphertext), CBS_len(&ciphertext))) {
+    goto err;
   }
-  if (!pbe_crypt(algor, pass_raw, pass_raw_len, in, in_len, &oct->data, &crypt_len,
-                 1 /* encrypt */)) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ENCRYPT_ERROR);
-    OPENSSL_free(in);
-    return NULL;
+
+  if (out_len > LONG_MAX) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+    goto err;
   }
-  oct->length = crypt_len;
-  OPENSSL_cleanse(in, in_len);
+
+  /* Convert back to legacy ASN.1 objects. */
+  const uint8_t *ptr = out;
+  ret = d2i_PKCS8_PRIV_KEY_INFO(NULL, &ptr, (long)out_len);
+  OPENSSL_cleanse(out, out_len);
+  if (ret == NULL || ptr != out + out_len) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR);
+    PKCS8_PRIV_KEY_INFO_free(ret);
+    ret = NULL;
+  }
+
+err:
   OPENSSL_free(in);
-  return oct;
+  OPENSSL_cleanse(out, out_len);
+  OPENSSL_free(out);
+  return ret;
 }
 
 X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass,
-                        int pass_len, uint8_t *salt, size_t salt_len,
+                        int pass_len, const uint8_t *salt, size_t salt_len,
                         int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
   uint8_t *pass_raw = NULL;
   size_t pass_raw_len = 0;
@@ -560,42 +516,97 @@
 
 X509_SIG *PKCS8_encrypt_pbe(int pbe_nid, const EVP_CIPHER *cipher,
                             const uint8_t *pass_raw, size_t pass_raw_len,
-                            uint8_t *salt, size_t salt_len,
+                            const uint8_t *salt, size_t salt_len,
                             int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
-  X509_SIG *pkcs8 = NULL;
-  X509_ALGOR *pbe;
+  X509_SIG *ret = NULL;
+  uint8_t *plaintext = NULL, *salt_buf = NULL, *der = NULL;
+  int plaintext_len = -1;
+  size_t der_len;
+  CBB cbb;
+  CBB_zero(&cbb);
+  EVP_CIPHER_CTX ctx;
+  EVP_CIPHER_CTX_init(&ctx);
 
-  pkcs8 = X509_SIG_new();
-  if (pkcs8 == NULL) {
-    OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE);
+  /* Generate a random salt if necessary. */
+  if (salt == NULL) {
+    if (salt_len == 0) {
+      salt_len = PKCS5_SALT_LEN;
+    }
+
+    salt_buf = OPENSSL_malloc(salt_len);
+    if (salt_buf == NULL ||
+        !RAND_bytes(salt_buf, salt_len)) {
+      goto err;
+    }
+
+    salt = salt_buf;
+  }
+
+  if (iterations <= 0) {
+    iterations = PKCS5_DEFAULT_ITERATIONS;
+  }
+
+  /* Convert the input from the legacy ASN.1 format. */
+  plaintext_len = i2d_PKCS8_PRIV_KEY_INFO(p8inf, &plaintext);
+  if (plaintext_len < 0) {
     goto err;
   }
 
+  CBB epki;
+  if (!CBB_init(&cbb, 128) ||
+      !CBB_add_asn1(&cbb, &epki, CBS_ASN1_SEQUENCE)) {
+    goto err;
+  }
+
+  int alg_ok;
   if (pbe_nid == -1) {
-    pbe = PKCS5_pbe2_set(cipher, iterations, salt, salt_len);
+    alg_ok = PKCS5_pbe2_encrypt_init(&epki, &ctx, cipher, (unsigned)iterations,
+                                     pass_raw, pass_raw_len, salt, salt_len);
   } else {
-    pbe = PKCS5_pbe_set(pbe_nid, iterations, salt, salt_len);
+    alg_ok = pkcs12_pbe_encrypt_init(&epki, &ctx, pbe_nid, (unsigned)iterations,
+                                     pass_raw, pass_raw_len, salt, salt_len);
   }
-  if (!pbe) {
-    OPENSSL_PUT_ERROR(PKCS8, ERR_R_ASN1_LIB);
+  if (!alg_ok) {
     goto err;
   }
 
-  X509_ALGOR_free(pkcs8->algor);
-  pkcs8->algor = pbe;
-  M_ASN1_OCTET_STRING_free(pkcs8->digest);
-  pkcs8->digest = pkcs12_item_i2d_encrypt(
-      pbe, ASN1_ITEM_rptr(PKCS8_PRIV_KEY_INFO), pass_raw, pass_raw_len, p8inf);
-  if (!pkcs8->digest) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ENCRYPT_ERROR);
+  size_t max_out = (size_t)plaintext_len + EVP_CIPHER_CTX_block_size(&ctx);
+  if (max_out < (size_t)plaintext_len) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_TOO_LONG);
     goto err;
   }
 
-  return pkcs8;
+  CBB ciphertext;
+  uint8_t *out;
+  int n1, n2;
+  if (!CBB_add_asn1(&epki, &ciphertext, CBS_ASN1_OCTETSTRING) ||
+      !CBB_reserve(&ciphertext, &out, max_out) ||
+      !EVP_CipherUpdate(&ctx, out, &n1, plaintext, plaintext_len) ||
+      !EVP_CipherFinal_ex(&ctx, out + n1, &n2) ||
+      !CBB_did_write(&ciphertext, n1 + n2) ||
+      !CBB_finish(&cbb, &der, &der_len)) {
+    goto err;
+  }
+
+  /* Convert back to legacy ASN.1 objects. */
+  const uint8_t *ptr = der;
+  ret = d2i_X509_SIG(NULL, &ptr, der_len);
+  if (ret == NULL || ptr != der + der_len) {
+    OPENSSL_PUT_ERROR(PKCS8, ERR_R_INTERNAL_ERROR);
+    X509_SIG_free(ret);
+    ret = NULL;
+  }
 
 err:
-  X509_SIG_free(pkcs8);
-  return NULL;
+  if (plaintext_len > 0) {
+    OPENSSL_cleanse(plaintext, plaintext_len);
+  }
+  OPENSSL_free(plaintext);
+  OPENSSL_free(salt_buf);
+  OPENSSL_free(der);
+  CBB_cleanup(&cbb);
+  EVP_CIPHER_CTX_cleanup(&ctx);
+  return ret;
 }
 
 EVP_PKEY *EVP_PKCS82PKEY(PKCS8_PRIV_KEY_INFO *p8) {
@@ -655,32 +666,21 @@
   size_t password_len;
 };
 
-static int PKCS12_handle_content_info(CBS *content_info, unsigned depth,
-                                      struct pkcs12_context *ctx);
-
-/* PKCS12_handle_content_infos parses a series of PKCS#7 ContentInfos in a
- * SEQUENCE. */
-static int PKCS12_handle_content_infos(CBS *content_infos,
-                                       unsigned depth,
-                                       struct pkcs12_context *ctx) {
+/* PKCS12_handle_sequence parses a BER-encoded SEQUENCE of elements in a PKCS#12
+ * structure. */
+static int PKCS12_handle_sequence(
+    CBS *sequence, struct pkcs12_context *ctx,
+    int (*handle_element)(CBS *cbs, struct pkcs12_context *ctx)) {
   uint8_t *der_bytes = NULL;
   size_t der_len;
   CBS in;
   int ret = 0;
 
-  /* Generally we only expect depths 0 (the top level, with a
-   * pkcs7-encryptedData and a pkcs7-data) and depth 1 (the various PKCS#12
-   * bags). */
-  if (depth > 3) {
-    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_PKCS12_TOO_DEEPLY_NESTED);
-    return 0;
-  }
-
   /* Although a BER->DER conversion is done at the beginning of |PKCS12_parse|,
    * the ASN.1 data gets wrapped in OCTETSTRINGs and/or encrypted and the
    * conversion cannot see through those wrappings. So each time we step
    * through one we need to convert to DER again. */
-  if (!CBS_asn1_ber_to_der(content_infos, &der_bytes, &der_len)) {
+  if (!CBS_asn1_ber_to_der(sequence, &der_bytes, &der_len)) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
     return 0;
   }
@@ -688,30 +688,28 @@
   if (der_bytes != NULL) {
     CBS_init(&in, der_bytes, der_len);
   } else {
-    CBS_init(&in, CBS_data(content_infos), CBS_len(content_infos));
+    CBS_init(&in, CBS_data(sequence), CBS_len(sequence));
   }
 
-  if (!CBS_get_asn1(&in, &in, CBS_ASN1_SEQUENCE)) {
+  CBS child;
+  if (!CBS_get_asn1(&in, &child, CBS_ASN1_SEQUENCE) ||
+      CBS_len(&in) != 0) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
     goto err;
   }
 
-  while (CBS_len(&in) > 0) {
-    CBS content_info;
-    if (!CBS_get_asn1(&in, &content_info, CBS_ASN1_SEQUENCE)) {
+  while (CBS_len(&child) > 0) {
+    CBS element;
+    if (!CBS_get_asn1(&child, &element, CBS_ASN1_SEQUENCE)) {
       OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
       goto err;
     }
 
-    if (!PKCS12_handle_content_info(&content_info, depth + 1, ctx)) {
+    if (!handle_element(&element, ctx)) {
       goto err;
     }
   }
 
-  /* NSS includes additional data after the SEQUENCE, but it's an (unwrapped)
-   * copy of the same encrypted private key (with the same IV and
-   * ciphertext)! */
-
   ret = 1;
 
 err:
@@ -719,17 +717,116 @@
   return ret;
 }
 
+/* PKCS12_handle_safe_bag parses a single SafeBag element in a PKCS#12
+ * structure. */
+static int PKCS12_handle_safe_bag(CBS *safe_bag, struct pkcs12_context *ctx) {
+  CBS bag_id, wrapped_value;
+  if (!CBS_get_asn1(safe_bag, &bag_id, CBS_ASN1_OBJECT) ||
+      !CBS_get_asn1(safe_bag, &wrapped_value,
+                        CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)
+      /* Ignore the bagAttributes field. */) {
+    OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+    return 0;
+  }
+
+  int nid = OBJ_cbs2nid(&bag_id);
+  if (nid == NID_pkcs8ShroudedKeyBag) {
+    /* See RFC 7292, section 4.2.2. */
+    if (*ctx->out_key) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_MULTIPLE_PRIVATE_KEYS_IN_PKCS12);
+      return 0;
+    }
+
+    if (CBS_len(&wrapped_value) > LONG_MAX) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      return 0;
+    }
+
+    /* |encrypted| isn't actually an X.509 signature, but it has the same
+     * structure as one and so |X509_SIG| is reused to store it. */
+    const uint8_t *inp = CBS_data(&wrapped_value);
+    X509_SIG *encrypted =
+        d2i_X509_SIG(NULL, &inp, (long)CBS_len(&wrapped_value));
+    if (encrypted == NULL) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      return 0;
+    }
+    if (inp != CBS_data(&wrapped_value) + CBS_len(&wrapped_value)) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      X509_SIG_free(encrypted);
+      return 0;
+    }
+
+    PKCS8_PRIV_KEY_INFO *pki =
+        PKCS8_decrypt_pbe(encrypted, ctx->password, ctx->password_len);
+    X509_SIG_free(encrypted);
+    if (pki == NULL) {
+      return 0;
+    }
+
+    *ctx->out_key = EVP_PKCS82PKEY(pki);
+    PKCS8_PRIV_KEY_INFO_free(pki);
+    return ctx->out_key != NULL;
+  }
+
+  if (nid == NID_certBag) {
+    /* See RFC 7292, section 4.2.3. */
+    CBS cert_bag, cert_type, wrapped_cert, cert;
+    if (!CBS_get_asn1(&wrapped_value, &cert_bag, CBS_ASN1_SEQUENCE) ||
+        !CBS_get_asn1(&cert_bag, &cert_type, CBS_ASN1_OBJECT) ||
+        !CBS_get_asn1(&cert_bag, &wrapped_cert,
+                      CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) ||
+        !CBS_get_asn1(&wrapped_cert, &cert, CBS_ASN1_OCTETSTRING)) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      return 0;
+    }
+
+    if (OBJ_cbs2nid(&cert_type) != NID_x509Certificate) {
+      return 1;
+    }
+
+    if (CBS_len(&cert) > LONG_MAX) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      return 0;
+    }
+
+    const uint8_t *inp = CBS_data(&cert);
+    X509 *x509 = d2i_X509(NULL, &inp, (long)CBS_len(&cert));
+    if (!x509) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      return 0;
+    }
+
+    if (inp != CBS_data(&cert) + CBS_len(&cert)) {
+      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+      X509_free(x509);
+      return 0;
+    }
+
+    if (0 == sk_X509_push(ctx->out_certs, x509)) {
+      X509_free(x509);
+      return 0;
+    }
+
+    return 1;
+  }
+
+  /* Unknown element type - ignore it. */
+  return 1;
+}
+
 /* PKCS12_handle_content_info parses a single PKCS#7 ContentInfo element in a
  * PKCS#12 structure. */
-static int PKCS12_handle_content_info(CBS *content_info, unsigned depth,
+static int PKCS12_handle_content_info(CBS *content_info,
                                       struct pkcs12_context *ctx) {
-  CBS content_type, wrapped_contents, contents, content_infos;
+  CBS content_type, wrapped_contents, contents;
   int nid, ret = 0;
   uint8_t *storage = NULL;
 
   if (!CBS_get_asn1(content_info, &content_type, CBS_ASN1_OBJECT) ||
       !CBS_get_asn1(content_info, &wrapped_contents,
-                        CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) {
+                        CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) ||
+      CBS_len(content_info) != 0) {
     OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
     goto err;
   }
@@ -742,8 +839,6 @@
      * encrypted certificate bag and it's generally encrypted with 40-bit
      * RC2-CBC. */
     CBS version_bytes, eci, contents_type, ai, encrypted_contents;
-    X509_ALGOR *algor = NULL;
-    const uint8_t *inp;
     uint8_t *out;
     size_t out_len;
 
@@ -755,7 +850,7 @@
         !CBS_get_asn1(&eci, &contents_type, CBS_ASN1_OBJECT) ||
         /* AlgorithmIdentifier, see
          * https://tools.ietf.org/html/rfc5280#section-4.1.1.2 */
-        !CBS_get_asn1_element(&eci, &ai, CBS_ASN1_SEQUENCE) ||
+        !CBS_get_asn1(&eci, &ai, CBS_ASN1_SEQUENCE) ||
         !CBS_get_asn1_implicit_string(
             &eci, &encrypted_contents, &storage,
             CBS_ASN1_CONTEXT_SPECIFIC | 0, CBS_ASN1_OCTETSTRING)) {
@@ -763,122 +858,32 @@
       goto err;
     }
 
-    if (OBJ_cbs2nid(&contents_type) != NID_pkcs7_data ||
-        CBS_len(&ai) > LONG_MAX) {
+    if (OBJ_cbs2nid(&contents_type) != NID_pkcs7_data) {
       OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
       goto err;
     }
 
-    inp = CBS_data(&ai);
-    algor = d2i_X509_ALGOR(NULL, &inp, (long)CBS_len(&ai));
-    if (algor == NULL) {
-      goto err;
-    }
-    if (inp != CBS_data(&ai) + CBS_len(&ai)) {
-      X509_ALGOR_free(algor);
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
+    if (!pbe_decrypt(&out, &out_len, &ai, ctx->password, ctx->password_len,
+                     CBS_data(&encrypted_contents),
+                     CBS_len(&encrypted_contents))) {
       goto err;
     }
 
-    if (!pbe_crypt(algor, ctx->password, ctx->password_len,
-                   CBS_data(&encrypted_contents), CBS_len(&encrypted_contents),
-                   &out, &out_len, 0 /* decrypt */)) {
-      X509_ALGOR_free(algor);
-      goto err;
-    }
-    X509_ALGOR_free(algor);
-
-    CBS_init(&content_infos, out, out_len);
-    ret = PKCS12_handle_content_infos(&content_infos, depth + 1, ctx);
+    CBS safe_contents;
+    CBS_init(&safe_contents, out, out_len);
+    ret = PKCS12_handle_sequence(&safe_contents, ctx, PKCS12_handle_safe_bag);
     OPENSSL_free(out);
   } else if (nid == NID_pkcs7_data) {
     CBS octet_string_contents;
 
     if (!CBS_get_asn1(&wrapped_contents, &octet_string_contents,
-                          CBS_ASN1_OCTETSTRING)) {
+                      CBS_ASN1_OCTETSTRING)) {
       OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
       goto err;
     }
 
-    ret = PKCS12_handle_content_infos(&octet_string_contents, depth + 1, ctx);
-  } else if (nid == NID_pkcs8ShroudedKeyBag) {
-    /* See ftp://ftp.rsasecurity.com/pub/pkcs/pkcs-12/pkcs-12v1.pdf, section
-     * 4.2.2. */
-    const uint8_t *inp = CBS_data(&wrapped_contents);
-    PKCS8_PRIV_KEY_INFO *pki = NULL;
-    X509_SIG *encrypted = NULL;
-
-    if (*ctx->out_key) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_MULTIPLE_PRIVATE_KEYS_IN_PKCS12);
-      goto err;
-    }
-
-    if (CBS_len(&wrapped_contents) > LONG_MAX) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-      goto err;
-    }
-
-    /* encrypted isn't actually an X.509 signature, but it has the same
-     * structure as one and so |X509_SIG| is reused to store it. */
-    encrypted = d2i_X509_SIG(NULL, &inp, (long)CBS_len(&wrapped_contents));
-    if (encrypted == NULL) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-      goto err;
-    }
-    if (inp != CBS_data(&wrapped_contents) + CBS_len(&wrapped_contents)) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-      X509_SIG_free(encrypted);
-      goto err;
-    }
-
-    pki = PKCS8_decrypt_pbe(encrypted, ctx->password, ctx->password_len);
-    X509_SIG_free(encrypted);
-    if (pki == NULL) {
-      goto err;
-    }
-
-    *ctx->out_key = EVP_PKCS82PKEY(pki);
-    PKCS8_PRIV_KEY_INFO_free(pki);
-
-    if (ctx->out_key == NULL) {
-      goto err;
-    }
-    ret = 1;
-  } else if (nid == NID_certBag) {
-    CBS cert_bag, cert_type, wrapped_cert, cert;
-
-    if (!CBS_get_asn1(&wrapped_contents, &cert_bag, CBS_ASN1_SEQUENCE) ||
-        !CBS_get_asn1(&cert_bag, &cert_type, CBS_ASN1_OBJECT) ||
-        !CBS_get_asn1(&cert_bag, &wrapped_cert,
-                      CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) ||
-        !CBS_get_asn1(&wrapped_cert, &cert, CBS_ASN1_OCTETSTRING)) {
-      OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-      goto err;
-    }
-
-    if (OBJ_cbs2nid(&cert_type) == NID_x509Certificate) {
-      if (CBS_len(&cert) > LONG_MAX) {
-        OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-        goto err;
-      }
-      const uint8_t *inp = CBS_data(&cert);
-      X509 *x509 = d2i_X509(NULL, &inp, (long)CBS_len(&cert));
-      if (!x509) {
-        OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-        goto err;
-      }
-      if (inp != CBS_data(&cert) + CBS_len(&cert)) {
-        OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
-        X509_free(x509);
-        goto err;
-      }
-
-      if (0 == sk_X509_push(ctx->out_certs, x509)) {
-        X509_free(x509);
-        goto err;
-      }
-    }
-    ret = 1;
+    ret = PKCS12_handle_sequence(&octet_string_contents, ctx,
+                                 PKCS12_handle_safe_bag);
   } else {
     /* Unknown element type - ignore it. */
     ret = 1;
@@ -995,7 +1000,7 @@
     iterations = 1;
     if (CBS_len(&mac_data) > 0) {
       if (!CBS_get_asn1_uint64(&mac_data, &iterations) ||
-          iterations > INT_MAX) {
+          iterations > UINT_MAX) {
         OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA);
         goto err;
       }
@@ -1026,7 +1031,7 @@
   }
 
   /* authsafes contains a series of PKCS#7 ContentInfos. */
-  if (!PKCS12_handle_content_infos(&authsafes, 0, &ctx)) {
+  if (!PKCS12_handle_sequence(&authsafes, &ctx, PKCS12_handle_content_info)) {
     goto err;
   }
 
@@ -1054,7 +1059,8 @@
   size_t ber_len;
 };
 
-PKCS12* d2i_PKCS12(PKCS12 **out_p12, const uint8_t **ber_bytes, size_t ber_len) {
+PKCS12 *d2i_PKCS12(PKCS12 **out_p12, const uint8_t **ber_bytes,
+                   size_t ber_len) {
   PKCS12 *p12;
 
   p12 = OPENSSL_malloc(sizeof(PKCS12));
diff --git a/src/crypto/pkcs8/pkcs8_test.cc b/src/crypto/pkcs8/pkcs8_test.cc
index cbb2043..1196f9f 100644
--- a/src/crypto/pkcs8/pkcs8_test.cc
+++ b/src/crypto/pkcs8/pkcs8_test.cc
@@ -21,6 +21,8 @@
 #include <openssl/pkcs8.h>
 #include <openssl/x509.h>
 
+#include "../internal.h"
+
 
 /* kDER is a PKCS#8 encrypted private key. It was generated with:
  *
@@ -60,7 +62,89 @@
   0xd6, 0x2d,
 };
 
-static bool test(const uint8_t *der, size_t der_len) {
+/* kNullPassword is a PKCS#8 encrypted private key using the null password. */
+static const uint8_t kNullPassword[] = {
+    0x30, 0x81, 0xb0, 0x30, 0x1b, 0x06, 0x0a, 0x2a, 0x86, 0x48, 0x86, 0xf7,
+    0x0d, 0x01, 0x0c, 0x01, 0x03, 0x30, 0x0d, 0x04, 0x08, 0xb2, 0xfe, 0x68,
+    0xc2, 0xea, 0x0f, 0x10, 0x9c, 0x02, 0x01, 0x01, 0x04, 0x81, 0x90, 0xe2,
+    0xf6, 0x1c, 0xca, 0xad, 0x64, 0x30, 0xbf, 0x88, 0x04, 0x35, 0xe5, 0x0f,
+    0x11, 0x49, 0x06, 0x01, 0x14, 0x33, 0x80, 0xa2, 0x78, 0x44, 0x5b, 0xaa,
+    0x0d, 0xd7, 0x00, 0x36, 0x9d, 0x91, 0x97, 0x37, 0x20, 0x7b, 0x27, 0xc1,
+    0xa0, 0xa2, 0x73, 0x06, 0x15, 0xdf, 0xc8, 0x13, 0x9b, 0xc9, 0x8c, 0x9c,
+    0xce, 0x00, 0xd0, 0xc8, 0x42, 0xc1, 0xda, 0x2b, 0x07, 0x2b, 0x12, 0xa3,
+    0xce, 0x10, 0x39, 0x7a, 0xf1, 0x55, 0x69, 0x8d, 0xa5, 0xc4, 0x2a, 0x00,
+    0x0d, 0x94, 0xc6, 0xde, 0x6a, 0x3d, 0xb7, 0xe5, 0x6d, 0x59, 0x3e, 0x09,
+    0xb5, 0xe3, 0x3e, 0xfc, 0x50, 0x56, 0xe9, 0x50, 0x42, 0x7c, 0xe7, 0xf0,
+    0x19, 0xbd, 0x31, 0xa7, 0x85, 0x47, 0xb3, 0xe9, 0xb3, 0x50, 0x3c, 0xc9,
+    0x32, 0x37, 0x1a, 0x93, 0x78, 0x48, 0x78, 0x82, 0xde, 0xad, 0x5c, 0xf2,
+    0xcf, 0xf2, 0xbb, 0x2c, 0x44, 0x05, 0x7f, 0x4a, 0xf9, 0xb1, 0x2b, 0xdd,
+    0x49, 0xf6, 0x7e, 0xd0, 0x42, 0xaa, 0x14, 0x3c, 0x24, 0x77, 0xb4,
+};
+
+/* kNullPasswordNSS is a PKCS#8 encrypted private key using the null password
+ * and generated by NSS. */
+static const uint8_t kNullPasswordNSS[] = {
+    0x30, 0x81, 0xb8, 0x30, 0x23, 0x06, 0x0a, 0x2a, 0x86, 0x48, 0x86, 0xf7,
+    0x0d, 0x01, 0x0c, 0x01, 0x03, 0x30, 0x15, 0x04, 0x10, 0x3f, 0xac, 0xe9,
+    0x38, 0xdb, 0x40, 0x6b, 0x26, 0x89, 0x09, 0x73, 0x18, 0x8d, 0x7f, 0x1c,
+    0x82, 0x02, 0x01, 0x01, 0x04, 0x81, 0x90, 0x5e, 0x5e, 0x11, 0xef, 0xbb,
+    0x7c, 0x4d, 0xec, 0xc0, 0xdc, 0xc7, 0x23, 0xd2, 0xc4, 0x77, 0xbc, 0xf4,
+    0x5d, 0x59, 0x4c, 0x07, 0xc2, 0x8a, 0x26, 0xfa, 0x25, 0x1c, 0xaa, 0x42,
+    0xed, 0xd0, 0xed, 0xbb, 0x5c, 0xe9, 0x13, 0x07, 0xaa, 0xdd, 0x52, 0x3c,
+    0x65, 0x25, 0xbf, 0x94, 0x02, 0xaf, 0xd6, 0x97, 0xe9, 0x33, 0x00, 0x76,
+    0x64, 0x4a, 0x73, 0xab, 0xfb, 0x99, 0x6e, 0x83, 0x12, 0x05, 0x86, 0x72,
+    0x6c, 0xd5, 0xa4, 0xcf, 0xb1, 0xd5, 0x4d, 0x54, 0x87, 0x8b, 0x4b, 0x95,
+    0x1d, 0xcd, 0xf3, 0xfe, 0xa8, 0xda, 0xe0, 0xb6, 0x72, 0x13, 0x3f, 0x2e,
+    0x66, 0xe0, 0xb9, 0x2e, 0xfa, 0x69, 0x40, 0xbe, 0xd7, 0x67, 0x6e, 0x53,
+    0x2b, 0x3f, 0x53, 0xe5, 0x39, 0x54, 0x77, 0xe1, 0x1d, 0xe6, 0x81, 0x92,
+    0x58, 0x82, 0x14, 0xfb, 0x47, 0x85, 0x3c, 0xc3, 0xdf, 0xdd, 0xcc, 0x79,
+    0x9f, 0x41, 0x83, 0x72, 0xf2, 0x0a, 0xe9, 0xe1, 0x2c, 0x12, 0xb0, 0xb0,
+    0x0a, 0xb2, 0x1d, 0xca, 0x15, 0xb2, 0xca,
+};
+
+/* kEmptyPasswordOpenSSL is a PKCS#8 encrypted private key using the empty
+ * password and generated by OpenSSL. */
+static const uint8_t kEmptyPasswordOpenSSL[] = {
+    0x30, 0x82, 0x01, 0xa1, 0x30, 0x1b, 0x06, 0x0a, 0x2a, 0x86, 0x48, 0x86,
+    0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x03, 0x30, 0x0d, 0x04, 0x08, 0x86, 0xaa,
+    0xd7, 0xdf, 0x3b, 0x91, 0x97, 0x60, 0x02, 0x01, 0x01, 0x04, 0x82, 0x01,
+    0x80, 0xcb, 0x2a, 0x14, 0xaa, 0x4f, 0x38, 0x4c, 0xe1, 0x49, 0x00, 0xe2,
+    0x1a, 0x3a, 0x75, 0x87, 0x7e, 0x3d, 0xea, 0x4d, 0x53, 0xd4, 0x46, 0x47,
+    0x23, 0x8f, 0xa1, 0x72, 0x51, 0x92, 0x86, 0x8b, 0xeb, 0x53, 0xe6, 0x6a,
+    0x0a, 0x6b, 0xb6, 0xa0, 0xdc, 0x0f, 0xdc, 0x20, 0xc3, 0x45, 0x85, 0xf1,
+    0x95, 0x90, 0x5c, 0xf4, 0xfa, 0xee, 0x47, 0xaf, 0x35, 0xd0, 0xd0, 0xd3,
+    0x14, 0xde, 0x0d, 0xca, 0x1b, 0xd3, 0xbb, 0x20, 0xec, 0x9d, 0x6a, 0xd4,
+    0xc1, 0xce, 0x60, 0x81, 0xab, 0x0c, 0x72, 0x10, 0xfa, 0x28, 0x3c, 0xac,
+    0x87, 0x7b, 0x82, 0x85, 0x00, 0xb8, 0x58, 0x9c, 0x07, 0xc4, 0x7d, 0xa9,
+    0xc5, 0x94, 0x95, 0xf7, 0x23, 0x93, 0x3f, 0xed, 0xef, 0x92, 0x55, 0x25,
+    0x74, 0xbb, 0xd3, 0xd1, 0x67, 0x3b, 0x3d, 0x5a, 0xfe, 0x84, 0xf8, 0x97,
+    0x7d, 0x7c, 0x01, 0xc7, 0xd7, 0x0d, 0xf8, 0xc3, 0x6d, 0xd6, 0xf1, 0xaa,
+    0x9d, 0x1f, 0x69, 0x97, 0x45, 0x06, 0xc4, 0x1c, 0x95, 0x3c, 0xe0, 0xef,
+    0x11, 0xb2, 0xb3, 0x72, 0x91, 0x9e, 0x7d, 0x0f, 0x7f, 0xc8, 0xf6, 0x64,
+    0x49, 0x5e, 0x3c, 0x53, 0x37, 0x79, 0x03, 0x1c, 0x3f, 0x29, 0x6c, 0x6b,
+    0xea, 0x4c, 0x35, 0x9b, 0x6d, 0x1b, 0x59, 0x43, 0x4c, 0x14, 0x47, 0x2a,
+    0x36, 0x39, 0x2a, 0xd8, 0x96, 0x90, 0xdc, 0xfc, 0xd2, 0xdd, 0x23, 0x0e,
+    0x2c, 0xb3, 0x83, 0xf9, 0xf2, 0xe3, 0xe6, 0x99, 0x53, 0x57, 0x33, 0xc5,
+    0x5f, 0xf9, 0xfd, 0x56, 0x0b, 0x32, 0xd4, 0xf3, 0x9d, 0x5b, 0x34, 0xe5,
+    0x94, 0xbf, 0xb6, 0xc0, 0xce, 0xe1, 0x73, 0x5c, 0x02, 0x7a, 0x4c, 0xed,
+    0xde, 0x23, 0x38, 0x89, 0x9f, 0xcd, 0x51, 0xf3, 0x90, 0x80, 0xd3, 0x4b,
+    0x83, 0xd3, 0xee, 0xf2, 0x9e, 0x35, 0x91, 0xa5, 0xa3, 0xc0, 0x5c, 0xce,
+    0xdb, 0xaa, 0x70, 0x1e, 0x1d, 0xc1, 0x44, 0xea, 0x3b, 0xa7, 0x5a, 0x11,
+    0xd1, 0xf3, 0xf3, 0xd0, 0xf4, 0x5a, 0xc4, 0x99, 0xaf, 0x8d, 0xe2, 0xbc,
+    0xa2, 0xb9, 0x3d, 0x86, 0x5e, 0xba, 0xa0, 0xdf, 0x78, 0x81, 0x7c, 0x54,
+    0x31, 0xe3, 0x98, 0xb5, 0x46, 0xcb, 0x4d, 0x26, 0x4b, 0xf8, 0xac, 0x3a,
+    0x54, 0x1b, 0x77, 0x5a, 0x18, 0xa5, 0x43, 0x0e, 0x14, 0xde, 0x7b, 0xb7,
+    0x4e, 0x45, 0x99, 0x03, 0xd1, 0x3d, 0x18, 0xb2, 0x36, 0x00, 0x48, 0x07,
+    0x72, 0xbb, 0x4f, 0x21, 0x25, 0x3e, 0xda, 0x25, 0x24, 0x5b, 0xc8, 0xa0,
+    0x28, 0xd5, 0x9b, 0x96, 0x87, 0x07, 0x77, 0x84, 0xff, 0xd7, 0xac, 0x71,
+    0xf6, 0x61, 0x63, 0x0b, 0xfb, 0x42, 0xfd, 0x52, 0xf4, 0xc4, 0x35, 0x0c,
+    0xc2, 0xc1, 0x55, 0x22, 0x42, 0x2f, 0x13, 0x7d, 0x93, 0x27, 0xc8, 0x11,
+    0x35, 0xc5, 0xe3, 0xc5, 0xaa, 0x15, 0x3c, 0xac, 0x30, 0xbc, 0x45, 0x16,
+    0xed,
+};
+
+static bool TestDecrypt(const uint8_t *der, size_t der_len,
+                        const char *password) {
   const uint8_t *data = der;
   bssl::UniquePtr<X509_SIG> sig(d2i_X509_SIG(NULL, &data, der_len));
   if (sig.get() == NULL || data != der + der_len) {
@@ -68,8 +152,8 @@
     return false;
   }
 
-  static const char kPassword[] = "testing";
-  bssl::UniquePtr<PKCS8_PRIV_KEY_INFO> keypair(PKCS8_decrypt(sig.get(), kPassword, -1));
+  bssl::UniquePtr<PKCS8_PRIV_KEY_INFO> keypair(
+      PKCS8_decrypt(sig.get(), password, -1));
   if (!keypair) {
     fprintf(stderr, "PKCS8_decrypt failed.\n");
     ERR_print_errors_fp(stderr);
@@ -79,8 +163,92 @@
   return true;
 }
 
+static bool TestRoundTrip(int pbe_nid, const EVP_CIPHER *cipher,
+                          const char *password, const uint8_t *salt,
+                          size_t salt_len, int iterations) {
+  static const uint8_t kSampleKey[] = {
+      0x30, 0x81, 0x87, 0x02, 0x01, 0x00, 0x30, 0x13, 0x06, 0x07, 0x2a, 0x86,
+      0x48, 0xce, 0x3d, 0x02, 0x01, 0x06, 0x08, 0x2a, 0x86, 0x48, 0xce, 0x3d,
+      0x03, 0x01, 0x07, 0x04, 0x6d, 0x30, 0x6b, 0x02, 0x01, 0x01, 0x04, 0x20,
+      0x8a, 0x87, 0x2f, 0xb6, 0x28, 0x93, 0xc4, 0xd1, 0xff, 0xc5, 0xb9, 0xf0,
+      0xf9, 0x17, 0x58, 0x06, 0x9f, 0x83, 0x52, 0xe0, 0x8f, 0xa0, 0x5a, 0x49,
+      0xf8, 0xdb, 0x92, 0x6c, 0xb5, 0x72, 0x87, 0x25, 0xa1, 0x44, 0x03, 0x42,
+      0x00, 0x04, 0x2c, 0x15, 0x0f, 0x42, 0x9c, 0xe7, 0x0f, 0x21, 0x6c, 0x25,
+      0x2c, 0xf5, 0xe0, 0x62, 0xce, 0x1f, 0x63, 0x9c, 0xd5, 0xd1, 0x65, 0xc7,
+      0xf8, 0x94, 0x24, 0x07, 0x2c, 0x27, 0x19, 0x7d, 0x78, 0xb3, 0x3b, 0x92,
+      0x0e, 0x95, 0xcd, 0xb6, 0x64, 0xe9, 0x90, 0xdc, 0xf0, 0xcf, 0xea, 0x0d,
+      0x94, 0xe2, 0xa8, 0xe6, 0xaf, 0x9d, 0x0e, 0x58, 0x05, 0x6e, 0x65, 0x31,
+      0x04, 0x92, 0x5b, 0x9f, 0xe6, 0xc9,
+  };
+
+  const uint8_t *ptr = kSampleKey;
+  bssl::UniquePtr<PKCS8_PRIV_KEY_INFO> key(
+      d2i_PKCS8_PRIV_KEY_INFO(nullptr, &ptr, sizeof(kSampleKey)));
+  if (!key || ptr != kSampleKey + sizeof(kSampleKey)) {
+    return false;
+  }
+
+  bssl::UniquePtr<X509_SIG> encrypted(PKCS8_encrypt(
+      pbe_nid, cipher, password, -1, salt, salt_len, iterations, key.get()));
+  if (!encrypted) {
+    fprintf(stderr, "Failed to encrypt private key.\n");
+    return false;
+  }
+
+  bssl::UniquePtr<PKCS8_PRIV_KEY_INFO> key2(
+      PKCS8_decrypt(encrypted.get(), password, -1));
+  if (!key2) {
+    fprintf(stderr, "Failed to decrypt private key.\n");
+    return false;
+  }
+
+  uint8_t *encoded = nullptr;
+  int len = i2d_PKCS8_PRIV_KEY_INFO(key2.get(), &encoded);
+  bssl::UniquePtr<uint8_t> free_encoded(encoded);
+  if (len < 0 ||
+      static_cast<size_t>(len) != sizeof(kSampleKey) ||
+      OPENSSL_memcmp(encoded, kSampleKey, sizeof(kSampleKey)) != 0) {
+    fprintf(stderr, "Decrypted private key did not round-trip.");
+    return false;
+  }
+
+  return true;
+}
+
 int main(int argc, char **argv) {
-  if (!test(kDER, sizeof(kDER))) {
+  CRYPTO_library_init();
+
+  if (!TestDecrypt(kDER, sizeof(kDER), "testing") ||
+      !TestDecrypt(kNullPassword, sizeof(kNullPassword), NULL) ||
+      !TestDecrypt(kNullPasswordNSS, sizeof(kNullPasswordNSS), NULL) ||
+      !TestDecrypt(kEmptyPasswordOpenSSL, sizeof(kEmptyPasswordOpenSSL), "") ||
+      !TestRoundTrip(NID_pbe_WithSHA1And3_Key_TripleDES_CBC, nullptr,
+                     "password", nullptr, 0, 10) ||
+      // Vary the salt
+      !TestRoundTrip(NID_pbe_WithSHA1And3_Key_TripleDES_CBC, nullptr,
+                     "password", nullptr, 4, 10) ||
+      !TestRoundTrip(NID_pbe_WithSHA1And3_Key_TripleDES_CBC, nullptr,
+                     "password", (const uint8_t *)"salt", 4, 10) ||
+      // Vary the iteration count.
+      !TestRoundTrip(NID_pbe_WithSHA1And3_Key_TripleDES_CBC, nullptr,
+                     "password", nullptr, 0, 1) ||
+      // Vary the password.
+      !TestRoundTrip(NID_pbe_WithSHA1And3_Key_TripleDES_CBC, nullptr, "",
+                     nullptr, 0, 1) ||
+      !TestRoundTrip(NID_pbe_WithSHA1And3_Key_TripleDES_CBC, nullptr, nullptr,
+                     nullptr, 0, 1) ||
+      // Vary the PBE suite.
+      !TestRoundTrip(NID_pbe_WithSHA1And40BitRC2_CBC, nullptr, "password",
+                     nullptr, 0, 10) ||
+      !TestRoundTrip(NID_pbe_WithSHA1And128BitRC4, nullptr, "password", nullptr,
+                     0, 10) ||
+      // Test PBES2.
+      !TestRoundTrip(-1, EVP_aes_128_cbc(), "password", nullptr, 0, 10) ||
+      !TestRoundTrip(-1, EVP_aes_128_cbc(), "password", nullptr, 4, 10) ||
+      !TestRoundTrip(-1, EVP_aes_128_cbc(), "password", (const uint8_t *)"salt",
+                     4, 10) ||
+      !TestRoundTrip(-1, EVP_aes_128_cbc(), "password", nullptr, 0, 1) ||
+      !TestRoundTrip(-1, EVP_rc2_cbc(), "password", nullptr, 0, 10)) {
     return 1;
   }
 
diff --git a/src/crypto/rand/CMakeLists.txt b/src/crypto/rand/CMakeLists.txt
index c66d2ee..f7c11f1 100644
--- a/src/crypto/rand/CMakeLists.txt
+++ b/src/crypto/rand/CMakeLists.txt
@@ -14,6 +14,7 @@
   OBJECT
 
   deterministic.c
+  fuchsia.c
   rand.c
   urandom.c
   windows.c
diff --git a/src/crypto/rand/fuchsia.c b/src/crypto/rand/fuchsia.c
new file mode 100644
index 0000000..2e138d0
--- /dev/null
+++ b/src/crypto/rand/fuchsia.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2017, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/rand.h>
+
+#if defined(OPENSSL_FUCHSIA) && !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE)
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include <magenta/syscalls.h>
+
+#include "internal.h"
+
+void CRYPTO_sysrand(uint8_t *out, size_t requested) {
+  while (requested > 0) {
+    size_t output_bytes_this_pass = MX_CPRNG_DRAW_MAX_LEN;
+    if (requested < output_bytes_this_pass) {
+      output_bytes_this_pass = requested;
+    }
+    size_t bytes_drawn;
+    mx_status_t status =
+        mx_cprng_draw(out, output_bytes_this_pass, &bytes_drawn);
+    if (status != NO_ERROR) {
+      abort();
+    }
+    requested -= bytes_drawn;
+    out += bytes_drawn;
+  }
+}
+
+#endif /* OPENSSL_FUCHSIA && !BORINGSSL_UNSAFE_DETERMINISTIC_MODE */
diff --git a/src/crypto/rand/urandom.c b/src/crypto/rand/urandom.c
index 2233203..3ccd940 100644
--- a/src/crypto/rand/urandom.c
+++ b/src/crypto/rand/urandom.c
@@ -16,7 +16,8 @@
 
 #include <openssl/rand.h>
 
-#if !defined(OPENSSL_WINDOWS) && !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE)
+#if !defined(OPENSSL_WINDOWS) && !defined(OPENSSL_FUCHSIA) && \
+    !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE)
 
 #include <assert.h>
 #include <errno.h>
@@ -328,4 +329,5 @@
   }
 }
 
-#endif  /* !OPENSSL_WINDOWS && !BORINGSSL_UNSAFE_DETERMINISTIC_MODE */
+#endif /* !OPENSSL_WINDOWS && !defined(OPENSSL_FUCHSIA) && \
+          !BORINGSSL_UNSAFE_DETERMINISTIC_MODE */
diff --git a/src/crypto/rsa/rsa_impl.c b/src/crypto/rsa/rsa_impl.c
index 3834be5..8e0aa9c 100644
--- a/src/crypto/rsa/rsa_impl.c
+++ b/src/crypto/rsa/rsa_impl.c
@@ -769,8 +769,6 @@
 int rsa_default_multi_prime_keygen(RSA *rsa, int bits, int num_primes,
                                    BIGNUM *e_value, BN_GENCB *cb) {
   BIGNUM *r0 = NULL, *r1 = NULL, *r2 = NULL, *r3 = NULL, *tmp;
-  BIGNUM local_r0, local_p;
-  BIGNUM *pr0, *p;
   int prime_bits, ok = -1, n = 0, i, j;
   BN_CTX *ctx = NULL;
   STACK_OF(RSA_additional_prime) *additional_primes = NULL;
@@ -999,9 +997,7 @@
       goto err;
     }
   }
-  pr0 = &local_r0;
-  BN_with_flags(pr0, r0, BN_FLG_CONSTTIME);
-  if (!BN_mod_inverse(rsa->d, rsa->e, pr0, ctx)) {
+  if (!BN_mod_inverse(rsa->d, rsa->e, r0, ctx)) {
     goto err; /* d */
   }
 
@@ -1019,10 +1015,9 @@
    * from constant-time, |bn_mod_inverse_secret_prime| uses the same modular
    * exponentation logic as in RSA private key operations and, if the RSAZ-1024
    * code is enabled, will be optimized for common RSA prime sizes. */
-  p = &local_p;
-  BN_with_flags(p, rsa->p, BN_FLG_CONSTTIME);
   if (!BN_MONT_CTX_set_locked(&rsa->mont_p, &rsa->lock, rsa->p, ctx) ||
-      !bn_mod_inverse_secret_prime(rsa->iqmp, rsa->q, p, ctx, rsa->mont_p)) {
+      !bn_mod_inverse_secret_prime(rsa->iqmp, rsa->q, rsa->p, ctx,
+                                   rsa->mont_p)) {
     goto err;
   }
 
diff --git a/src/crypto/x509/t_x509.c b/src/crypto/x509/t_x509.c
index d5d48ba..d4f6bba 100644
--- a/src/crypto/x509/t_x509.c
+++ b/src/crypto/x509/t_x509.c
@@ -54,6 +54,7 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.] */
 
+#include <ctype.h>
 #include <openssl/asn1.h>
 #include <openssl/bio.h>
 #include <openssl/digest.h>
@@ -415,45 +416,84 @@
     return (0);
 }
 
-int ASN1_UTCTIME_print(BIO *bp, const ASN1_UTCTIME *tm)
-{
-    const char *v;
-    int gmt = 0;
-    int i;
-    int y = 0, M = 0, d = 0, h = 0, m = 0, s = 0;
+// consume_two_digits is a helper function for ASN1_UTCTIME_print. If |*v|,
+// assumed to be |*len| bytes long, has two leading digits, updates |*out| with
+// their value, updates |v| and |len|, and returns one. Otherwise, returns
+// zero.
+static int consume_two_digits(int* out, const char **v, int *len) {
+  if (*len < 2|| !isdigit((*v)[0]) || !isdigit((*v)[1])) {
+    return 0;
+  }
+  *out = ((*v)[0] - '0') * 10 + ((*v)[1] - '0');
+  *len -= 2;
+  *v += 2;
+  return 1;
+}
 
-    i = tm->length;
-    v = (const char *)tm->data;
+// consume_zulu_timezone is a helper function for ASN1_UTCTIME_print. If |*v|,
+// assumed to be |*len| bytes long, starts with "Z" then it updates |*v| and
+// |*len| and returns one. Otherwise returns zero.
+static int consume_zulu_timezone(const char **v, int *len) {
+  if (*len == 0 || (*v)[0] != 'Z') {
+    return 0;
+  }
 
-    if (i < 10)
-        goto err;
-    if (v[i - 1] == 'Z')
-        gmt = 1;
-    for (i = 0; i < 10; i++)
-        if ((v[i] > '9') || (v[i] < '0'))
-            goto err;
-    y = (v[0] - '0') * 10 + (v[1] - '0');
-    if (y < 50)
-        y += 100;
-    M = (v[2] - '0') * 10 + (v[3] - '0');
-    if ((M > 12) || (M < 1))
-        goto err;
-    d = (v[4] - '0') * 10 + (v[5] - '0');
-    h = (v[6] - '0') * 10 + (v[7] - '0');
-    m = (v[8] - '0') * 10 + (v[9] - '0');
-    if (tm->length >= 12 &&
-        (v[10] >= '0') && (v[10] <= '9') && (v[11] >= '0') && (v[11] <= '9'))
-        s = (v[10] - '0') * 10 + (v[11] - '0');
+  *len -= 1;
+  *v += 1;
+  return 1;
+}
 
-    if (BIO_printf(bp, "%s %2d %02d:%02d:%02d %d%s",
-                   mon[M - 1], d, h, m, s, y + 1900,
-                   (gmt) ? " GMT" : "") <= 0)
-        return (0);
-    else
-        return (1);
- err:
-    BIO_write(bp, "Bad time value", 14);
-    return (0);
+int ASN1_UTCTIME_print(BIO *bp, const ASN1_UTCTIME *tm) {
+  const char *v = (const char *)tm->data;
+  int len = tm->length;
+  int Y = 0, M = 0, D = 0, h = 0, m = 0, s = 0;
+
+  // YYMMDDhhmm are required to be present.
+  if (!consume_two_digits(&Y, &v, &len) ||
+      !consume_two_digits(&M, &v, &len) ||
+      !consume_two_digits(&D, &v, &len) ||
+      !consume_two_digits(&h, &v, &len) ||
+      !consume_two_digits(&m, &v, &len)) {
+    goto err;
+  }
+  // https://tools.ietf.org/html/rfc5280, section 4.1.2.5.1, requires seconds
+  // to be present, but historically this code has forgiven its absence.
+  consume_two_digits(&s, &v, &len);
+
+  // https://tools.ietf.org/html/rfc5280, section 4.1.2.5.1, specifies this
+  // interpretation of the year.
+  if (Y < 50) {
+    Y += 2000;
+  } else {
+    Y += 1900;
+  }
+  if (M > 12 || M == 0) {
+    goto err;
+  }
+  if (D > 31 || D == 0) {
+    goto err;
+  }
+  if (h > 23 || m > 59 || s > 60) {
+    goto err;
+  }
+
+  // https://tools.ietf.org/html/rfc5280, section 4.1.2.5.1, requires the "Z"
+  // to be present, but historically this code has forgiven its absence.
+  const int is_gmt = consume_zulu_timezone(&v, &len);
+
+  // https://tools.ietf.org/html/rfc5280, section 4.1.2.5.1, does not permit
+  // the specification of timezones using the +hhmm / -hhmm syntax, which is
+  // the only other thing that might legitimately be found at the end.
+  if (len) {
+    goto err;
+  }
+
+  return BIO_printf(bp, "%s %2d %02d:%02d:%02d %d%s", mon[M - 1], D, h, m, s, Y,
+                    is_gmt ? " GMT" : "") > 0;
+
+err:
+  BIO_write(bp, "Bad time value", 14);
+  return 0;
 }
 
 int X509_NAME_print(BIO *bp, X509_NAME *name, int obase)
diff --git a/src/crypto/x509/x509_test.cc b/src/crypto/x509/x509_test.cc
index 3629c13..4b80af8 100644
--- a/src/crypto/x509/x509_test.cc
+++ b/src/crypto/x509/x509_test.cc
@@ -17,6 +17,7 @@
 #include <assert.h>
 #include <string.h>
 
+#include <openssl/bio.h>
 #include <openssl/bytestring.h>
 #include <openssl/crypto.h>
 #include <openssl/digest.h>
@@ -1011,6 +1012,73 @@
   return true;
 }
 
+static bool TestPrintUTCTIME() {
+  static const struct {
+    const char *val, *want;
+  } asn1_utctime_tests[] = {
+    {"", "Bad time value"},
+
+    // Correct RFC 5280 form. Test years < 2000 and > 2000.
+    {"090303125425Z", "Mar  3 12:54:25 2009 GMT"},
+    {"900303125425Z", "Mar  3 12:54:25 1990 GMT"},
+    {"000303125425Z", "Mar  3 12:54:25 2000 GMT"},
+
+    // Correct form, bad values.
+    {"000000000000Z", "Bad time value"},
+    {"999999999999Z", "Bad time value"},
+
+    // Missing components. Not legal RFC 5280, but permitted.
+    {"090303125425", "Mar  3 12:54:25 2009"},
+    {"9003031254", "Mar  3 12:54:00 1990"},
+    {"9003031254Z", "Mar  3 12:54:00 1990 GMT"},
+
+    // GENERALIZEDTIME confused for UTCTIME.
+    {"20090303125425Z", "Bad time value"},
+
+    // Legal ASN.1, but not legal RFC 5280.
+    {"9003031254+0800", "Bad time value"},
+    {"9003031254-0800", "Bad time value"},
+
+    // Trailing garbage.
+    {"9003031254Z ", "Bad time value"},
+  };
+
+  for (auto t : asn1_utctime_tests) {
+    bssl::UniquePtr<ASN1_UTCTIME> tm(ASN1_UTCTIME_new());
+    bssl::UniquePtr<BIO> bio(BIO_new(BIO_s_mem()));
+
+    // Use this instead of ASN1_UTCTIME_set() because some callers get
+    // type-confused and pass ASN1_GENERALIZEDTIME to ASN1_UTCTIME_print().
+    // ASN1_UTCTIME_set_string() is stricter, and would reject the inputs in
+    // question.
+    if (!ASN1_STRING_set(tm.get(), t.val, strlen(t.val))) {
+      fprintf(stderr, "ASN1_STRING_set\n");
+      return false;
+    }
+    const int ok = ASN1_UTCTIME_print(bio.get(), tm.get());
+
+    const uint8_t *contents;
+    size_t len;
+    if (!BIO_mem_contents(bio.get(), &contents, &len)) {
+      fprintf(stderr, "BIO_mem_contents\n");
+      return false;
+    }
+
+    if (ok != (strcmp(t.want, "Bad time value") != 0)) {
+      fprintf(stderr, "ASN1_UTCTIME_print(%s): bad return value\n", t.val);
+      return false;
+    }
+    if (len != strlen(t.want) || memcmp(contents, t.want, len)) {
+      fprintf(stderr, "ASN1_UTCTIME_print(%s): got %.*s, want %s\n", t.val,
+              static_cast<int>(len),
+              reinterpret_cast<const char *>(contents), t.want);
+      return false;
+    }
+  }
+
+  return true;
+}
+
 int main() {
   CRYPTO_library_init();
 
@@ -1023,7 +1091,8 @@
       !TestFromBufferTrailingData() ||
       !TestFromBufferModified() ||
       !TestFromBufferReused() ||
-      !TestFailedParseFromBuffer()) {
+      !TestFailedParseFromBuffer() ||
+      !TestPrintUTCTIME()) {
     return 1;
   }
 
diff --git a/src/include/openssl/aead.h b/src/include/openssl/aead.h
index eaa2b8f..35e0f13 100644
--- a/src/include/openssl/aead.h
+++ b/src/include/openssl/aead.h
@@ -64,7 +64,7 @@
  *
  * The "seal" and "open" operations are atomic - an entire message must be
  * encrypted or decrypted in a single call. Large messages may have to be split
- * up in order to accomodate this. When doing so, be mindful of the need not to
+ * up in order to accommodate this. When doing so, be mindful of the need not to
  * repeat nonces and the possibility that an attacker could duplicate, reorder
  * or drop message chunks. For example, using a single key for a given (large)
  * message and sealing chunks with nonces counting from zero would be secure as
diff --git a/src/include/openssl/base.h b/src/include/openssl/base.h
index bdb590a..3f47521 100644
--- a/src/include/openssl/base.h
+++ b/src/include/openssl/base.h
@@ -118,6 +118,10 @@
 #define OPENSSL_LINUX
 #endif
 
+#if defined(__Fuchsia__)
+#define OPENSSL_FUCHSIA
+#endif
+
 #if defined(TRUSTY)
 #define OPENSSL_TRUSTY
 #define OPENSSL_NO_THREADS
@@ -229,9 +233,6 @@
 typedef struct Netscape_certificate_sequence NETSCAPE_CERT_SEQUENCE;
 typedef struct Netscape_spkac_st NETSCAPE_SPKAC;
 typedef struct Netscape_spki_st NETSCAPE_SPKI;
-typedef struct PBE2PARAM_st PBE2PARAM;
-typedef struct PBEPARAM_st PBEPARAM;
-typedef struct PBKDF2PARAM_st PBKDF2PARAM;
 typedef struct RIPEMD160state_st RIPEMD160_CTX;
 typedef struct X509_POLICY_CACHE_st X509_POLICY_CACHE;
 typedef struct X509_POLICY_LEVEL_st X509_POLICY_LEVEL;
diff --git a/src/include/openssl/bio.h b/src/include/openssl/bio.h
index 847cb44..110629e 100644
--- a/src/include/openssl/bio.h
+++ b/src/include/openssl/bio.h
@@ -226,7 +226,7 @@
                             long larg, long return_value);
 
 /* BIO_callback_ctrl allows the callback function to be manipulated. The |cmd|
- * arg will generally be |BIO_CTRL_SET_CALLBACK| but arbitary command values
+ * arg will generally be |BIO_CTRL_SET_CALLBACK| but arbitrary command values
  * can be interpreted by the |BIO|. */
 OPENSSL_EXPORT long BIO_callback_ctrl(BIO *bio, int cmd, bio_info_cb fp);
 
@@ -469,7 +469,7 @@
 
 /* BIO_set_fp sets the |FILE| for |bio|. If |close_flag| is |BIO_CLOSE| then
  * |fclose| will be called on |file| when |bio| is closed. It returns one on
- * sucess and zero otherwise. */
+ * success and zero otherwise. */
 OPENSSL_EXPORT int BIO_set_fp(BIO *bio, FILE *file, int close_flag);
 
 /* BIO_read_filename opens |filename| for reading and sets the result as the
@@ -656,7 +656,8 @@
 #define BIO_CTRL_GET_CALLBACK	15  /* opt - set callback function */
 #define BIO_CTRL_SET_FILENAME	30	/* BIO_s_file special */
 
-/* BIO_CTRL_DUP is never used, but exists to allow code to compile more easily. */
+/* BIO_CTRL_DUP is never used, but exists to allow code to compile more
+ * easily. */
 #define BIO_CTRL_DUP	12
 
 
diff --git a/src/include/openssl/bn.h b/src/include/openssl/bn.h
index 18beba4..77f6196 100644
--- a/src/include/openssl/bn.h
+++ b/src/include/openssl/bn.h
@@ -134,7 +134,7 @@
 #endif
 
 
-/* BN provides support for working with arbitary sized integers. For example,
+/* BN provides support for working with arbitrary sized integers. For example,
  * although the largest integer supported by the compiler might be 64 bits, BN
  * will allow you to work with numbers until you run out of memory. */
 
@@ -194,13 +194,6 @@
 /* BN_value_one returns a static BIGNUM with value 1. */
 OPENSSL_EXPORT const BIGNUM *BN_value_one(void);
 
-/* BN_with_flags initialises a stack allocated |BIGNUM| with pointers to the
- * contents of |in| but with |flags| ORed into the flags field.
- *
- * Note: the two BIGNUMs share state and so |out| should /not/ be passed to
- * |BN_free|. */
-OPENSSL_EXPORT void BN_with_flags(BIGNUM *out, const BIGNUM *in, int flags);
-
 
 /* Basic functions. */
 
@@ -233,12 +226,6 @@
 /* BN_is_negative returns one if |bn| is negative and zero otherwise. */
 OPENSSL_EXPORT int BN_is_negative(const BIGNUM *bn);
 
-/* BN_get_flags returns |bn->flags| & |flags|. */
-OPENSSL_EXPORT int BN_get_flags(const BIGNUM *bn, int flags);
-
-/* BN_set_flags sets |flags| on |bn|. */
-OPENSSL_EXPORT void BN_set_flags(BIGNUM *bn, int flags);
-
 
 /* Conversion functions. */
 
@@ -358,7 +345,7 @@
 /* BIGNUM pools.
  *
  * Certain BIGNUM operations need to use many temporary variables and
- * allocating and freeing them can be quite slow. Thus such opertions typically
+ * allocating and freeing them can be quite slow. Thus such operations typically
  * take a |BN_CTX| parameter, which contains a pool of |BIGNUMs|. The |ctx|
  * argument to a public function may be NULL, in which case a local |BN_CTX|
  * will be created just for the lifetime of that call.
@@ -657,7 +644,7 @@
  * |BN_GENCB| structure.
  *
  * The callback receives the address of that |BN_GENCB| structure as its last
- * argument and the user is free to put an arbitary pointer in |arg|. The other
+ * argument and the user is free to put an arbitrary pointer in |arg|. The other
  * arguments are set as follows:
  *   event=BN_GENCB_GENERATED, n=i:   after generating the i'th possible prime
  *                                    number.
@@ -762,11 +749,10 @@
 /* BN_mod_inverse sets |out| equal to |a|^-1, mod |n|. If |out| is NULL, a
  * fresh BIGNUM is allocated. It returns the result or NULL on error.
  *
- * If either of |a| or |n| have |BN_FLG_CONSTTIME| set then the operation is
- * performed using an algorithm that avoids some branches but which isn't
- * constant-time. This function shouldn't be used for secret values, even
- * with |BN_FLG_CONSTTIME|; use |BN_mod_inverse_blinded| instead. Or, if
- * |n| is guaranteed to be prime, use
+ * If |n| is even then the operation is performed using an algorithm that avoids
+ * some branches but which isn't constant-time. This function shouldn't be used
+ * for secret values; use |BN_mod_inverse_blinded| instead. Or, if |n| is
+ * guaranteed to be prime, use
  * |BN_mod_exp_mont_consttime(out, a, m_minus_2, m, ctx, m_mont)|, taking
  * advantage of Fermat's Little Theorem. */
 OPENSSL_EXPORT BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a,
@@ -775,11 +761,9 @@
 /* BN_mod_inverse_blinded sets |out| equal to |a|^-1, mod |n|, where |n| is the
  * Montgomery modulus for |mont|. |a| must be non-negative and must be less
  * than |n|. |n| must be greater than 1. |a| is blinded (masked by a random
- * value) to protect it against side-channel attacks. |BN_mod_inverse_blinded|
- * may or may not ignore the |BN_FLG_CONSTTIME| flag on any/all of its inputs.
- * It returns one on success or zero on failure. On failure, if the failure was
- * caused by |a| having no inverse mod |n| then |*out_no_inverse| will be set
- * to one; otherwise it will be set to zero. */
+ * value) to protect it against side-channel attacks. On failure, if the failure
+ * was caused by |a| having no inverse mod |n| then |*out_no_inverse| will be
+ * set to one; otherwise it will be set to zero. */
 int BN_mod_inverse_blinded(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                            const BN_MONT_CTX *mont, BN_CTX *ctx);
 
@@ -860,9 +844,9 @@
                           BN_CTX *ctx);
 
 /* BN_mod_exp sets |r| equal to |a|^{|p|} mod |m|. It does so with the best
- * algorithm for the values provided and can run in constant time if
- * |BN_FLG_CONSTTIME| is set for |p|. It returns one on success or zero
- * otherwise. */
+ * algorithm for the values provided. It returns one on success or zero
+ * otherwise. The |BN_mod_exp_mont_consttime| variant must be used if the
+ * exponent is secret. */
 OPENSSL_EXPORT int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
                               const BIGNUM *m, BN_CTX *ctx);
 
@@ -930,10 +914,11 @@
 
 #define BN_FLG_MALLOCED 0x01
 #define BN_FLG_STATIC_DATA 0x02
-/* Avoid leaking exponent information through timing. |BN_mod_exp_mont| will
- * call |BN_mod_exp_mont_consttime| and |BN_mod_inverse| will call
- * |BN_mod_inverse_no_branch|. */
-#define BN_FLG_CONSTTIME 0x04
+/* |BN_FLG_CONSTTIME| has been removed and intentionally omitted so code relying
+ * on it will not compile. Consumers outside BoringSSL should use the
+ * higher-level cryptographic algorithms exposed by other modules. Consumers
+ * within the library should call the appropriate timing-sensitive algorithm
+ * directly. */
 
 
 #if defined(__cplusplus)
diff --git a/src/include/openssl/cipher.h b/src/include/openssl/cipher.h
index a8585d7..2ee74ef 100644
--- a/src/include/openssl/cipher.h
+++ b/src/include/openssl/cipher.h
@@ -191,7 +191,7 @@
  * |*out_len| to the number of bytes written. If padding is enabled (the
  * default) then padding is removed from the final block.
  *
- * WARNING: it is unsafe to call this function with unauthenticted
+ * WARNING: it is unsafe to call this function with unauthenticated
  * ciphertext if padding is enabled. */
 OPENSSL_EXPORT int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out,
                                        int *out_len);
@@ -282,12 +282,13 @@
 /* EVP_CIPHER_CTX_set_key_length sets the key length for |ctx|. This is only
  * valid for ciphers that can take a variable length key. It returns one on
  * success and zero on error. */
-OPENSSL_EXPORT int EVP_CIPHER_CTX_set_key_length(EVP_CIPHER_CTX *ctx, unsigned key_len);
+OPENSSL_EXPORT int EVP_CIPHER_CTX_set_key_length(EVP_CIPHER_CTX *ctx,
+                                                 unsigned key_len);
 
 
 /* Cipher accessors. */
 
-/* EVP_CIPHER_nid returns a NID identifing |cipher|. (For example,
+/* EVP_CIPHER_nid returns a NID identifying |cipher|. (For example,
  * |NID_aes_128_gcm|.) */
 OPENSSL_EXPORT int EVP_CIPHER_nid(const EVP_CIPHER *cipher);
 
diff --git a/src/include/openssl/conf.h b/src/include/openssl/conf.h
index 0a09e24..8b82fd4 100644
--- a/src/include/openssl/conf.h
+++ b/src/include/openssl/conf.h
@@ -77,7 +77,7 @@
  *   [section_name]
  *   key2=value2
  *
- * Config files are representated by a |CONF|. */
+ * Config files are represented by a |CONF|. */
 
 struct conf_value_st {
   char *section;
diff --git a/src/include/openssl/digest.h b/src/include/openssl/digest.h
index ec62993..caf5861 100644
--- a/src/include/openssl/digest.h
+++ b/src/include/openssl/digest.h
@@ -143,7 +143,8 @@
  * at least this much space. */
 #define EVP_MAX_MD_SIZE 64 /* SHA-512 is the longest so far. */
 
-/* EVP_MAX_MD_BLOCK_SIZE is the largest digest block size supported, in bytes. */
+/* EVP_MAX_MD_BLOCK_SIZE is the largest digest block size supported, in
+ * bytes. */
 #define EVP_MAX_MD_BLOCK_SIZE 128 /* SHA-512 is the longest so far. */
 
 /* EVP_DigestFinal_ex finishes the digest in |ctx| and writes the output to
@@ -175,7 +176,7 @@
  * These functions allow code to learn details about an abstract hash
  * function. */
 
-/* EVP_MD_type returns a NID identifing |md|. (For example, |NID_sha256|.) */
+/* EVP_MD_type returns a NID identifying |md|. (For example, |NID_sha256|.) */
 OPENSSL_EXPORT int EVP_MD_type(const EVP_MD *md);
 
 /* EVP_MD_flags returns the flags for |md|, which is a set of |EVP_MD_FLAG_*|
diff --git a/src/include/openssl/dsa.h b/src/include/openssl/dsa.h
index d6c3204..2988877 100644
--- a/src/include/openssl/dsa.h
+++ b/src/include/openssl/dsa.h
@@ -71,7 +71,7 @@
 #endif
 
 
-/* DSA contains functions for signing and verifing with the Digital Signature
+/* DSA contains functions for signing and verifying with the Digital Signature
  * Algorithm. */
 
 
@@ -351,10 +351,10 @@
  * Use |DSA_parse_private_key| instead. */
 OPENSSL_EXPORT DSA *d2i_DSAPrivateKey(DSA **out, const uint8_t **inp, long len);
 
-/* i2d_DSAPrivateKey marshals a private key from |in| to an ASN.1, DER structure.
- * If |outp| is not NULL then the result is written to |*outp| and |*outp| is
- * advanced just past the output. It returns the number of bytes in the result,
- * whether written or not, or a negative value on error.
+/* i2d_DSAPrivateKey marshals a private key from |in| to an ASN.1, DER
+ * structure. If |outp| is not NULL then the result is written to |*outp| and
+ * |*outp| is advanced just past the output. It returns the number of bytes in
+ * the result, whether written or not, or a negative value on error.
  *
  * Use |DSA_marshal_private_key| instead. */
 OPENSSL_EXPORT int i2d_DSAPrivateKey(const DSA *in, uint8_t **outp);
diff --git a/src/include/openssl/ec.h b/src/include/openssl/ec.h
index e780347..a1cd5c7 100644
--- a/src/include/openssl/ec.h
+++ b/src/include/openssl/ec.h
@@ -272,8 +272,8 @@
 OPENSSL_EXPORT int EC_POINT_dbl(const EC_GROUP *group, EC_POINT *r,
                                 const EC_POINT *a, BN_CTX *ctx);
 
-/* EC_POINT_invert sets |a| equal to minus |a|. It returns one on success and zero
- * otherwise. If |ctx| is not NULL, it may be used. */
+/* EC_POINT_invert sets |a| equal to minus |a|. It returns one on success and
+ * zero otherwise. If |ctx| is not NULL, it may be used. */
 OPENSSL_EXPORT int EC_POINT_invert(const EC_GROUP *group, EC_POINT *a,
                                    BN_CTX *ctx);
 
diff --git a/src/include/openssl/ecdsa.h b/src/include/openssl/ecdsa.h
index f6e9982..3890744 100644
--- a/src/include/openssl/ecdsa.h
+++ b/src/include/openssl/ecdsa.h
@@ -66,7 +66,7 @@
  * Algorithm over elliptic curves. */
 
 
-/* Signing and verifing. */
+/* Signing and verifying. */
 
 /* ECDSA_sign signs |digest_len| bytes from |digest| with |key| and writes the
  * resulting signature to |sig|, which must have |ECDSA_size(key)| bytes of
@@ -80,7 +80,7 @@
 /* ECDSA_verify verifies that |sig_len| bytes from |sig| constitute a valid
  * signature by |key| of |digest|. (The |type| argument should be zero.) It
  * returns one on success or zero if the signature is invalid or an error
- * occured. */
+ * occurred. */
 OPENSSL_EXPORT int ECDSA_verify(int type, const uint8_t *digest,
                                 size_t digest_len, const uint8_t *sig,
                                 size_t sig_len, EC_KEY *key);
diff --git a/src/include/openssl/err.h b/src/include/openssl/err.h
index f6efa12..a747b30 100644
--- a/src/include/openssl/err.h
+++ b/src/include/openssl/err.h
@@ -120,7 +120,7 @@
 
 /* Error queue handling functions.
  *
- * Errors in OpenSSL are generally signalled by the return value of a function.
+ * Errors in OpenSSL are generally signaled by the return value of a function.
  * When a function fails it may add an entry to a per-thread error queue,
  * which is managed by the functions in this header.
  *
@@ -306,7 +306,7 @@
   ERR_put_error(ERR_LIB_SYS, 0, 0, __FILE__, __LINE__);
 
 /* ERR_put_error adds an error to the error queue, dropping the least recent
- * error if neccessary for space reasons. */
+ * error if necessary for space reasons. */
 OPENSSL_EXPORT void ERR_put_error(int library, int unused, int reason,
                                   const char *file, unsigned line);
 
@@ -331,14 +331,14 @@
 OPENSSL_EXPORT int ERR_pop_to_mark(void);
 
 struct err_error_st {
-  /* file contains the filename where the error occured. */
+  /* file contains the filename where the error occurred. */
   const char *file;
   /* data contains optional data. It must be freed with |OPENSSL_free| if
    * |flags&ERR_FLAG_MALLOCED|. */
   char *data;
   /* packed contains the error library and reason, as packed by ERR_PACK. */
   uint32_t packed;
-  /* line contains the line number where the error occured. */
+  /* line contains the line number where the error occurred. */
   uint16_t line;
   /* flags contains a bitwise-OR of ERR_FLAG_* values. */
   uint8_t flags;
diff --git a/src/include/openssl/evp.h b/src/include/openssl/evp.h
index e8deb10..7debbc5 100644
--- a/src/include/openssl/evp.h
+++ b/src/include/openssl/evp.h
@@ -174,7 +174,7 @@
 OPENSSL_EXPORT int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key);
 
 /* EVP_PKEY_set_type sets the type of |pkey| to |type|, which should be one of
- * the |EVP_PKEY_*| values. It returns one if sucessful or zero otherwise. If
+ * the |EVP_PKEY_*| values. It returns one if successful or zero otherwise. If
  * |pkey| is NULL, it simply reports whether the type is known. */
 OPENSSL_EXPORT int EVP_PKEY_set_type(EVP_PKEY *pkey, int type);
 
@@ -378,9 +378,10 @@
 /* PKCS5_PBKDF2_HMAC_SHA1 is the same as PKCS5_PBKDF2_HMAC, but with |digest|
  * fixed to |EVP_sha1|. */
 OPENSSL_EXPORT int PKCS5_PBKDF2_HMAC_SHA1(const char *password,
-                                          size_t password_len, const uint8_t *salt,
-                                          size_t salt_len, unsigned iterations,
-                                          size_t key_len, uint8_t *out_key);
+                                          size_t password_len,
+                                          const uint8_t *salt, size_t salt_len,
+                                          unsigned iterations, size_t key_len,
+                                          uint8_t *out_key);
 
 
 /* Public key contexts.
@@ -435,8 +436,8 @@
  * It returns one on success or zero on error. */
 OPENSSL_EXPORT int EVP_PKEY_verify_init(EVP_PKEY_CTX *ctx);
 
-/* EVP_PKEY_verify verifies that |sig_len| bytes from |sig| are a valid signature
- * for |data|.
+/* EVP_PKEY_verify verifies that |sig_len| bytes from |sig| are a valid
+ * signature for |data|.
  *
  * It returns one on success or zero on error. */
 OPENSSL_EXPORT int EVP_PKEY_verify(EVP_PKEY_CTX *ctx, const uint8_t *sig,
diff --git a/src/include/openssl/obj.h b/src/include/openssl/obj.h
index 8819593..63cf866 100644
--- a/src/include/openssl/obj.h
+++ b/src/include/openssl/obj.h
@@ -135,7 +135,7 @@
 
 /* Dealing with textual representations of object identifiers. */
 
-/* OBJ_txt2obj returns an ASN1_OBJECT for the textual respresentation in |s|.
+/* OBJ_txt2obj returns an ASN1_OBJECT for the textual representation in |s|.
  * If |dont_search_names| is zero, then |s| will be matched against the long
  * and short names of a known objects to find a match. Otherwise |s| must
  * contain an ASCII string with a dotted sequence of numbers. The resulting
diff --git a/src/include/openssl/pkcs8.h b/src/include/openssl/pkcs8.h
index e04a4f3..141ed8d 100644
--- a/src/include/openssl/pkcs8.h
+++ b/src/include/openssl/pkcs8.h
@@ -88,7 +88,7 @@
                                            const EVP_CIPHER *cipher,
                                            const uint8_t *pass_raw,
                                            size_t pass_raw_len,
-                                           uint8_t *salt, size_t salt_len,
+                                           const uint8_t *salt, size_t salt_len,
                                            int iterations,
                                            PKCS8_PRIV_KEY_INFO *p8inf);
 
@@ -123,7 +123,7 @@
  * unchanged.  */
 OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher,
                                        const char *pass, int pass_len,
-                                       uint8_t *salt, size_t salt_len,
+                                       const uint8_t *salt, size_t salt_len,
                                        int iterations,
                                        PKCS8_PRIV_KEY_INFO *p8inf);
 
diff --git a/src/include/openssl/pool.h b/src/include/openssl/pool.h
index a6dee9f..dc5c938 100644
--- a/src/include/openssl/pool.h
+++ b/src/include/openssl/pool.h
@@ -24,9 +24,9 @@
 
 /* Buffers and buffer pools.
  *
- * |CRYPTO_BUFFER|s are simply reference-counted blobs. A |CRYPTO_BUFFER_POOL| is
- * an intern table for |CRYPTO_BUFFER|s. This allows for a single copy of a given
- * blob to be kept in memory and referenced from multiple places. */
+ * |CRYPTO_BUFFER|s are simply reference-counted blobs. A |CRYPTO_BUFFER_POOL|
+ * is an intern table for |CRYPTO_BUFFER|s. This allows for a single copy of a
+ * given blob to be kept in memory and referenced from multiple places. */
 
 
 /* CRYPTO_BUFFER_POOL_new returns a freshly allocated |CRYPTO_BUFFER_POOL| or
diff --git a/src/include/openssl/rsa.h b/src/include/openssl/rsa.h
index 19be3ba..bad3fad 100644
--- a/src/include/openssl/rsa.h
+++ b/src/include/openssl/rsa.h
@@ -517,7 +517,7 @@
                                         int sLen);
 
 /* RSA_padding_add_PKCS1_OAEP acts like |RSA_padding_add_PKCS1_OAEP_mgf1| but
- * the |md| and |mgf1md| paramaters of the latter are implicitly set to NULL,
+ * the |md| and |mgf1md| parameters of the latter are implicitly set to NULL,
  * which means SHA-1. */
 OPENSSL_EXPORT int RSA_padding_add_PKCS1_OAEP(uint8_t *to, unsigned to_len,
                                               const uint8_t *from,
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index c230f8c..70646b0 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -904,14 +904,14 @@
                                                           const uint8_t *list,
                                                           size_t list_len);
 
-/* SSL_CTX_set_ocsp_response sets the OCSP reponse that is sent to clients
+/* SSL_CTX_set_ocsp_response sets the OCSP response that is sent to clients
  * which request it. It returns one on success and zero on error. The caller
  * retains ownership of |response|. */
 OPENSSL_EXPORT int SSL_CTX_set_ocsp_response(SSL_CTX *ctx,
                                              const uint8_t *response,
                                              size_t response_len);
 
-/* SSL_set_ocsp_response sets the OCSP reponse that is sent to clients which
+/* SSL_set_ocsp_response sets the OCSP response that is sent to clients which
  * request it. It returns one on success and zero on error. The caller retains
  * ownership of |response|. */
 OPENSSL_EXPORT int SSL_set_ocsp_response(SSL *ssl,
@@ -1152,7 +1152,7 @@
  * mode). */
 OPENSSL_EXPORT int SSL_CIPHER_is_AES(const SSL_CIPHER *cipher);
 
-/* SSL_CIPHER_has_MD5_HMAC returns one if |cipher| uses HMAC-MD5. */
+/* SSL_CIPHER_has_MD5_HMAC returns zero. */
 OPENSSL_EXPORT int SSL_CIPHER_has_MD5_HMAC(const SSL_CIPHER *cipher);
 
 /* SSL_CIPHER_has_SHA1_HMAC returns one if |cipher| uses HMAC-SHA1. */
@@ -1161,6 +1161,9 @@
 /* SSL_CIPHER_has_SHA256_HMAC returns one if |cipher| uses HMAC-SHA256. */
 OPENSSL_EXPORT int SSL_CIPHER_has_SHA256_HMAC(const SSL_CIPHER *cipher);
 
+/* SSL_CIPHER_is_AEAD returns one if |cipher| uses an AEAD cipher. */
+OPENSSL_EXPORT int SSL_CIPHER_is_AEAD(const SSL_CIPHER *cipher);
+
 /* SSL_CIPHER_is_AESGCM returns one if |cipher| uses AES-GCM. */
 OPENSSL_EXPORT int SSL_CIPHER_is_AESGCM(const SSL_CIPHER *cipher);
 
@@ -1195,6 +1198,10 @@
 /* SSL_CIPHER_is_ECDHE returns one if |cipher| uses ECDHE. */
 OPENSSL_EXPORT int SSL_CIPHER_is_ECDHE(const SSL_CIPHER *cipher);
 
+/* SSL_CIPHER_is_static_RSA returns one if |cipher| uses the static RSA key
+ * exchange. */
+OPENSSL_EXPORT int SSL_CIPHER_is_static_RSA(const SSL_CIPHER *cipher);
+
 /* SSL_CIPHER_get_min_version returns the minimum protocol version required
  * for |cipher|. */
 OPENSSL_EXPORT uint16_t SSL_CIPHER_get_min_version(const SSL_CIPHER *cipher);
@@ -1276,7 +1283,7 @@
  *   whose bulk cipher use the corresponding encryption scheme. Note that
  *   |AES|, |AES128|, and |AES256| match both CBC and GCM ciphers.
  *
- *   |MD5|, |SHA1|, |SHA256|, and |SHA384| match legacy cipher suites using the
+ *   |SHA1|, |SHA256|, and |SHA384| match legacy cipher suites using the
  *   corresponding hash function in their MAC. AEADs are matched by none of
  *   these.
  *
@@ -1428,8 +1435,8 @@
 OPENSSL_EXPORT int SSL_get_tls_unique(const SSL *ssl, uint8_t *out,
                                       size_t *out_len, size_t max_out);
 
-/* SSL_get_extms_support returns one if the Extended Master Secret
- * extension was negotiated. Otherwise, it returns zero. */
+/* SSL_get_extms_support returns one if the Extended Master Secret extension or
+ * TLS 1.3 was negotiated. Otherwise, it returns zero. */
 OPENSSL_EXPORT int SSL_get_extms_support(const SSL *ssl);
 
 /* SSL_get_current_cipher returns the cipher used in the current outgoing
@@ -1444,7 +1451,7 @@
 OPENSSL_EXPORT int SSL_session_reused(const SSL *ssl);
 
 /* SSL_get_secure_renegotiation_support returns one if the peer supports secure
- * renegotiation (RFC 5746) and zero otherwise. */
+ * renegotiation (RFC 5746) or TLS 1.3. Otherwise, it returns zero. */
 OPENSSL_EXPORT int SSL_get_secure_renegotiation_support(const SSL *ssl);
 
 /* SSL_export_keying_material exports a value derived from the master secret, as
@@ -1589,8 +1596,8 @@
  * was established at. For example, "TLSv1.2" or "SSLv3". */
 OPENSSL_EXPORT const char *SSL_SESSION_get_version(const SSL_SESSION *session);
 
-/* SSL_SESSION_get_id returns a pointer to a buffer containg |session|'s session
- * ID and sets |*out_len| to its length. */
+/* SSL_SESSION_get_id returns a pointer to a buffer containing |session|'s
+ * session ID and sets |*out_len| to its length. */
 OPENSSL_EXPORT const uint8_t *SSL_SESSION_get_id(const SSL_SESSION *session,
                                                  unsigned *out_len);
 
@@ -1601,7 +1608,7 @@
 /* SSL_SESSION_get_timeout returns the lifetime of |session| in seconds. */
 OPENSSL_EXPORT long SSL_SESSION_get_timeout(const SSL_SESSION *session);
 
-/* SSL_SESSION_get0_peer return's the peer leaf certificate stored in
+/* SSL_SESSION_get0_peer returns the peer leaf certificate stored in
  * |session|.
  *
  * TODO(davidben): This should return a const X509 *. */
@@ -1651,7 +1658,7 @@
  *
  * Note that offering or accepting a session short-circuits most parameter
  * negotiation. Resuming sessions across different configurations may result in
- * surprising behavor. So, for instance, a client implementing a version
+ * surprising behavior. So, for instance, a client implementing a version
  * fallback should shard its session cache by maximum protocol version. */
 
 /* SSL_SESS_CACHE_OFF disables all session caching. */
@@ -2368,7 +2375,7 @@
 #define SSL_TLSEXT_ERR_NOACK 3
 
 
-/* Application-layer protocol negotation.
+/* Application-layer protocol negotiation.
  *
  * The ALPN extension (RFC 7301) allows negotiating different application-layer
  * protocols over a single port. This is used, for example, to negotiate
@@ -2483,7 +2490,7 @@
  * or have a default application level protocol.
  *
  * 2) If the server supports NPN, but advertises an empty list then the
- * client selects the first protcol in its list, but indicates via the
+ * client selects the first protocol in its list, but indicates via the
  * API that this fallback case was enacted.
  *
  * 3) Otherwise, the client finds the first protocol in the server's list
@@ -2910,12 +2917,12 @@
 OPENSSL_EXPORT int SSL_renegotiate_pending(SSL *ssl);
 
 /* SSL_total_renegotiations returns the total number of renegotiation handshakes
- * peformed by |ssl|. This includes the pending renegotiation, if any. */
+ * performed by |ssl|. This includes the pending renegotiation, if any. */
 OPENSSL_EXPORT int SSL_total_renegotiations(const SSL *ssl);
 
 /* SSL_CTX_set_early_data_enabled sets whether early data is allowed to be used
  * with resumptions using |ctx|. WARNING: This is experimental and may cause
- * interop failures until fully implemented. */
+ * interoperability failures until fully implemented. */
 OPENSSL_EXPORT void SSL_CTX_set_early_data_enabled(SSL_CTX *ctx, int enabled);
 
 /* SSL_MAX_CERT_LIST_DEFAULT is the default maximum length, in bytes, of a peer
@@ -3034,14 +3041,14 @@
 #define SSL_CB_HANDSHAKE_DONE 0x20
 
 /* SSL_CTX_set_info_callback configures a callback to be run when various
- * events occur during a connection's lifetime. The |type| argumentj determines
+ * events occur during a connection's lifetime. The |type| argument determines
  * the type of event and the meaning of the |value| argument. Callbacks must
  * ignore unexpected |type| values.
  *
  * |SSL_CB_READ_ALERT| is signaled for each alert received, warning or fatal.
  * The |value| argument is a 16-bit value where the alert level (either
- * |SSL3_AL_WARNING| or |SSL3_AL_FATAL|) is in the most-significant eight bits and
- * the alert type (one of |SSL_AD_*|) is in the least-significant eight.
+ * |SSL3_AL_WARNING| or |SSL3_AL_FATAL|) is in the most-significant eight bits
+ * and the alert type (one of |SSL_AD_*|) is in the least-significant eight.
  *
  * |SSL_CB_WRITE_ALERT| is signaled for each alert sent. The |value| argument
  * is constructed as with |SSL_CB_READ_ALERT|.
@@ -3456,7 +3463,7 @@
 OPENSSL_EXPORT const char *SSL_get_version(const SSL *ssl);
 
 /* SSL_get_cipher_list returns the name of the |n|th cipher in the output of
- * |SSL_get_ciphers| or NULL if out of range. Use |SSL_get_ciphers| insteads. */
+ * |SSL_get_ciphers| or NULL if out of range. Use |SSL_get_ciphers| instead. */
 OPENSSL_EXPORT const char *SSL_get_cipher_list(const SSL *ssl, int n);
 
 /* SSL_CTX_set_client_cert_cb sets a callback which is called on the client if
@@ -4066,9 +4073,7 @@
    * be negotiated and zero otherwise. */
   unsigned short_header_enabled:1;
 
-  /* extra_certs is a dummy value included for compatibility.
-   * TODO(agl): remove once node.js no longer references this. */
-  STACK_OF(X509)* extra_certs;
+  /* TODO(agl): remove once node.js no longer references this. */
   int freelist_max_len;
 };
 
@@ -4240,9 +4245,6 @@
    * session space. Only effective on the server side. */
   unsigned retain_only_sha256_of_client_certs:1;
 
-  /* TODO(agl): remove once node.js not longer references this. */
-  int tlsext_status_type;
-
   /* session_timeout is the default lifetime in seconds of the session
    * created in this connection. */
   long session_timeout;
diff --git a/src/include/openssl/tls1.h b/src/include/openssl/tls1.h
index 7d9f2ac..4b3e1aa 100644
--- a/src/include/openssl/tls1.h
+++ b/src/include/openssl/tls1.h
@@ -604,8 +604,8 @@
 #define TLS1_TXT_ECDHE_PSK_WITH_CHACHA20_POLY1305_SHA256 \
   "ECDHE-PSK-CHACHA20-POLY1305"
 
-/* TODO(davidben): Remove this. Historically, the TXT names for CHACHA20_POLY1305
- * were missing 'SHA256'. */
+/* TODO(davidben): Remove this. Historically, the TXT names for
+ * CHACHA20_POLY1305 were missing 'SHA256'. */
 #define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 \
   TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256
 
diff --git a/src/include/openssl/x509.h b/src/include/openssl/x509.h
index ffd0fbc..88455dd 100644
--- a/src/include/openssl/x509.h
+++ b/src/include/openssl/x509.h
@@ -509,28 +509,6 @@
 	} CBC_PARAM;
 */
 
-/* Password based encryption structure */
-
-struct PBEPARAM_st {
-ASN1_OCTET_STRING *salt;
-ASN1_INTEGER *iter;
-} /* PBEPARAM */;
-
-/* Password based encryption V2 structures */
-
-struct PBE2PARAM_st {
-X509_ALGOR *keyfunc;
-X509_ALGOR *encryption;
-} /* PBE2PARAM */;
-
-struct PBKDF2PARAM_st {
-ASN1_TYPE *salt;	/* Usually OCTET STRING but could be anything */
-ASN1_INTEGER *iter;
-ASN1_INTEGER *keylength;
-X509_ALGOR *prf;
-} /* PBKDF2PARAM */;
-
-
 /* PKCS#8 private key info structure */
 
 struct pkcs8_priv_key_info_st
@@ -1090,24 +1068,6 @@
 				     ASN1_INTEGER *serial);
 OPENSSL_EXPORT X509 *X509_find_by_subject(STACK_OF(X509) *sk,X509_NAME *name);
 
-DECLARE_ASN1_FUNCTIONS(PBEPARAM)
-DECLARE_ASN1_FUNCTIONS(PBE2PARAM)
-DECLARE_ASN1_FUNCTIONS(PBKDF2PARAM)
-
-OPENSSL_EXPORT int PKCS5_pbe_set0_algor(X509_ALGOR *algor, int alg, int iter,
-				const unsigned char *salt, int saltlen);
-
-OPENSSL_EXPORT X509_ALGOR *PKCS5_pbe_set(int alg, int iter,
-				const unsigned char *salt, int saltlen);
-OPENSSL_EXPORT X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
-					 unsigned char *salt, int saltlen);
-OPENSSL_EXPORT X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
-				 unsigned char *salt, int saltlen,
-				 unsigned char *aiv, int prf_nid);
-
-OPENSSL_EXPORT X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
-				int prf_nid, int keylen);
-
 /* PKCS#8 utilities */
 
 DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index 919f5aa..eb47785 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -195,12 +195,11 @@
 #define SSL_AES (SSL_AES128 | SSL_AES256 | SSL_AES128GCM | SSL_AES256GCM)
 
 /* Bits for |algorithm_mac| (symmetric authentication). */
-#define SSL_MD5 0x00000001L
-#define SSL_SHA1 0x00000002L
-#define SSL_SHA256 0x00000004L
-#define SSL_SHA384 0x00000008L
+#define SSL_SHA1 0x00000001L
+#define SSL_SHA256 0x00000002L
+#define SSL_SHA384 0x00000004L
 /* SSL_AEAD is set for all AEADs. */
-#define SSL_AEAD 0x00000010L
+#define SSL_AEAD 0x00000008L
 
 /* Bits for |algorithm_prf| (handshake digest). */
 #define SSL_HANDSHAKE_MAC_DEFAULT 0x1
diff --git a/src/ssl/ssl_cipher.c b/src/ssl/ssl_cipher.c
index 20b075e..7ca79ab 100644
--- a/src/ssl/ssl_cipher.c
+++ b/src/ssl/ssl_cipher.c
@@ -678,7 +678,6 @@
      0},
 
     /* MAC aliases */
-    {"MD5", ~0u, ~0u, ~0u, SSL_MD5, 0},
     {"SHA1", ~0u, ~0u, ~SSL_eNULL, SSL_SHA1, 0},
     {"SHA", ~0u, ~0u, ~SSL_eNULL, SSL_SHA1, 0},
     {"SHA256", ~0u, ~0u, ~0u, SSL_SHA256, 0},
@@ -1473,7 +1472,7 @@
 }
 
 int SSL_CIPHER_has_MD5_HMAC(const SSL_CIPHER *cipher) {
-  return (cipher->algorithm_mac & SSL_MD5) != 0;
+  return 0;
 }
 
 int SSL_CIPHER_has_SHA1_HMAC(const SSL_CIPHER *cipher) {
@@ -1484,6 +1483,10 @@
   return (cipher->algorithm_mac & SSL_SHA256) != 0;
 }
 
+int SSL_CIPHER_is_AEAD(const SSL_CIPHER *cipher) {
+  return (cipher->algorithm_mac & SSL_AEAD) != 0;
+}
+
 int SSL_CIPHER_is_AESGCM(const SSL_CIPHER *cipher) {
   return (cipher->algorithm_enc & (SSL_AES128GCM | SSL_AES256GCM)) != 0;
 }
@@ -1526,6 +1529,10 @@
   return (cipher->algorithm_mkey & SSL_kECDHE) != 0;
 }
 
+int SSL_CIPHER_is_static_RSA(const SSL_CIPHER *cipher) {
+  return (cipher->algorithm_mkey & SSL_kRSA) != 0;
+}
+
 uint16_t SSL_CIPHER_get_min_version(const SSL_CIPHER *cipher) {
   if (cipher->algorithm_mkey == SSL_kGENERIC ||
       cipher->algorithm_auth == SSL_aGENERIC) {
@@ -1627,15 +1634,10 @@
 static const char *ssl_cipher_get_prf_name(const SSL_CIPHER *cipher) {
   switch (cipher->algorithm_prf) {
     case SSL_HANDSHAKE_MAC_DEFAULT:
-      /* Before TLS 1.2, the PRF component is the hash used in the HMAC, which is
-       * only ever MD5 or SHA-1. */
-      switch (cipher->algorithm_mac) {
-        case SSL_MD5:
-          return "MD5";
-        case SSL_SHA1:
-          return "SHA";
-      }
-      break;
+      /* Before TLS 1.2, the PRF component is the hash used in the HMAC, which
+       * is SHA-1 for all supported ciphers. */
+      assert(cipher->algorithm_mac == SSL_SHA1);
+      return "SHA";
     case SSL_HANDSHAKE_MAC_SHA256:
       return "SHA256";
     case SSL_HANDSHAKE_MAC_SHA384:
@@ -1824,10 +1826,6 @@
   }
 
   switch (alg_mac) {
-    case SSL_MD5:
-      mac = "MD5";
-      break;
-
     case SSL_SHA1:
       mac = "SHA1";
       break;
@@ -1917,19 +1915,9 @@
       return 0;
   }
 
-  size_t mac_len;
-  switch (cipher->algorithm_mac) {
-    case SSL_MD5:
-      mac_len = MD5_DIGEST_LENGTH;
-      break;
-    case SSL_SHA1:
-      mac_len = SHA_DIGEST_LENGTH;
-      break;
-    default:
-      return 0;
-  }
-
-  size_t ret = 1 + mac_len;
+  /* All supported TLS 1.0 ciphers use SHA-1. */
+  assert(cipher->algorithm_mac == SSL_SHA1);
+  size_t ret = 1 + SHA_DIGEST_LENGTH;
   ret += block_size - (ret % block_size);
   return ret;
 }
diff --git a/src/ssl/ssl_lib.c b/src/ssl/ssl_lib.c
index a60bf81..e0ab803 100644
--- a/src/ssl/ssl_lib.c
+++ b/src/ssl/ssl_lib.c
@@ -1506,7 +1506,11 @@
 }
 
 int SSL_get_secure_renegotiation_support(const SSL *ssl) {
-  return ssl->s3->send_connection_binding;
+  if (!ssl->s3->have_version) {
+    return 0;
+  }
+  return ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
+         ssl->s3->send_connection_binding;
 }
 
 LHASH_OF(SSL_SESSION) *SSL_CTX_sessions(SSL_CTX *ctx) { return ctx->sessions; }
diff --git a/src/ssl/t1_lib.c b/src/ssl/t1_lib.c
index 0681919..dc377c4 100644
--- a/src/ssl/t1_lib.c
+++ b/src/ssl/t1_lib.c
@@ -1136,10 +1136,6 @@
  *
  * https://tools.ietf.org/html/rfc6066#section-8 */
 
-static void ext_ocsp_init(SSL_HANDSHAKE *hs) {
-  hs->ssl->tlsext_status_type = -1;
-}
-
 static int ext_ocsp_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
   SSL *const ssl = hs->ssl;
   if (!ssl->ocsp_stapling_enabled) {
@@ -1156,7 +1152,6 @@
     return 0;
   }
 
-  ssl->tlsext_status_type = TLSEXT_STATUSTYPE_ocsp;
   return 1;
 }
 
@@ -2633,7 +2628,7 @@
   },
   {
     TLSEXT_TYPE_status_request,
-    ext_ocsp_init,
+    NULL,
     ext_ocsp_add_clienthello,
     ext_ocsp_parse_serverhello,
     ext_ocsp_parse_clienthello,
diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc
index a98ff43..bbbebe7 100644
--- a/src/ssl/test/bssl_shim.cc
+++ b/src/ssl/test/bssl_shim.cc
@@ -1305,11 +1305,22 @@
     }
   }
 
-  if (config->expect_extended_master_secret) {
-    if (!SSL_get_extms_support(ssl)) {
-      fprintf(stderr, "No EMS for connection when expected");
-      return false;
-    }
+  if (config->expect_extended_master_secret && !SSL_get_extms_support(ssl)) {
+    fprintf(stderr, "No EMS for connection when expected\n");
+    return false;
+  }
+
+  if (config->expect_secure_renegotiation &&
+      !SSL_get_secure_renegotiation_support(ssl)) {
+    fprintf(stderr, "No secure renegotiation for connection when expected\n");
+    return false;
+  }
+
+  if (config->expect_no_secure_renegotiation &&
+      SSL_get_secure_renegotiation_support(ssl)) {
+    fprintf(stderr,
+            "Secure renegotiation unexpectedly negotiated for connection\n");
+    return false;
   }
 
   if (!config->expected_ocsp_response.empty()) {
@@ -1622,6 +1633,9 @@
   if (is_resume && config->retain_only_sha256_client_cert_resume) {
     SSL_set_retain_only_sha256_of_client_certs(ssl.get(), 1);
   }
+  if (config->max_send_fragment > 0) {
+    SSL_set_max_send_fragment(ssl.get(), config->max_send_fragment);
+  }
 
   int sock = Connect(config->port);
   if (sock == -1) {
@@ -1785,12 +1799,15 @@
     }
     if (!config->shim_shuts_down) {
       for (;;) {
-        static const size_t kBufLen = 16384;
-        std::unique_ptr<uint8_t[]> buf(new uint8_t[kBufLen]);
-
         // Read only 512 bytes at a time in TLS to ensure records may be
         // returned in multiple reads.
-        int n = DoRead(ssl.get(), buf.get(), config->is_dtls ? kBufLen : 512);
+        size_t read_size = config->is_dtls ? 16384 : 512;
+        if (config->read_size > 0) {
+          read_size = config->read_size;
+        }
+        std::unique_ptr<uint8_t[]> buf(new uint8_t[read_size]);
+
+        int n = DoRead(ssl.get(), buf.get(), read_size);
         int err = SSL_get_error(ssl.get(), n);
         if (err == SSL_ERROR_ZERO_RETURN ||
             (n == 0 && err == SSL_ERROR_SYSCALL)) {
diff --git a/src/ssl/test/runner/common.go b/src/ssl/test/runner/common.go
index 96bc39a..3afd73f 100644
--- a/src/ssl/test/runner/common.go
+++ b/src/ssl/test/runner/common.go
@@ -1253,6 +1253,10 @@
 	// formats to send in ClientHello or ServerHello. If set to a non-nil
 	// empty slice, no extension will be sent.
 	SendSupportedPointFormats []byte
+
+	// MaxReceivePlaintext, if non-zero, is the maximum plaintext record
+	// length accepted from the peer.
+	MaxReceivePlaintext int
 }
 
 func (c *Config) serverInit() {
diff --git a/src/ssl/test/runner/conn.go b/src/ssl/test/runner/conn.go
index ee342fa..3cbd496 100644
--- a/src/ssl/test/runner/conn.go
+++ b/src/ssl/test/runner/conn.go
@@ -876,7 +876,11 @@
 		return err
 	}
 	data := b.data[b.off:]
-	if len(data) > maxPlaintext {
+	max := maxPlaintext
+	if c.config.Bugs.MaxReceivePlaintext != 0 {
+		max = c.config.Bugs.MaxReceivePlaintext
+	}
+	if len(data) > max {
 		err := c.sendAlert(alertRecordOverflow)
 		c.in.freeBlock(b)
 		return c.in.setErrorLocked(err)
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index ba78fce..3a884bc 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -2401,6 +2401,42 @@
 				},
 			},
 		},
+		{
+			// Test the server so there is a large certificate as
+			// well as application data.
+			testType: serverTest,
+			name:     "MaxSendFragment",
+			config: Config{
+				Bugs: ProtocolBugs{
+					MaxReceivePlaintext: 512,
+				},
+			},
+			messageLen: 1024,
+			flags: []string{
+				"-max-send-fragment", "512",
+				"-read-size", "1024",
+			},
+		},
+		{
+			// Test the server so there is a large certificate as
+			// well as application data.
+			testType: serverTest,
+			name:     "MaxSendFragment-TooLarge",
+			config: Config{
+				Bugs: ProtocolBugs{
+					// Ensure that some of the records are
+					// 512.
+					MaxReceivePlaintext: 511,
+				},
+			},
+			messageLen: 1024,
+			flags: []string{
+				"-max-send-fragment", "512",
+				"-read-size", "1024",
+			},
+			shouldFail:         true,
+			expectedLocalError: "local error: record overflow",
+		},
 	}
 	testCases = append(testCases, basicTests...)
 
@@ -5246,6 +5282,7 @@
 				RequireRenegotiationInfo: true,
 			},
 		},
+		flags: []string{"-expect-secure-renegotiation"},
 	})
 	testCases = append(testCases, testCase{
 		testType: serverTest,
@@ -5258,6 +5295,7 @@
 				RequireRenegotiationInfo: true,
 			},
 		},
+		flags: []string{"-expect-secure-renegotiation"},
 	})
 
 	// Test that illegal extensions in TLS 1.3 are rejected by the client if
@@ -6015,6 +6053,7 @@
 		flags: []string{
 			"-renegotiate-freely",
 			"-expect-total-renegotiations", "1",
+			"-expect-secure-renegotiation",
 		},
 	})
 	testCases = append(testCases, testCase{
@@ -6081,6 +6120,7 @@
 		flags: []string{
 			"-renegotiate-freely",
 			"-expect-total-renegotiations", "1",
+			"-expect-no-secure-renegotiation",
 		},
 	})
 
@@ -6347,6 +6387,22 @@
 		shouldFail:    true,
 		expectedError: ":UNEXPECTED_MESSAGE:",
 	})
+
+	// The renegotiation_info extension is not sent in TLS 1.3, but TLS 1.3
+	// always reads as supporting it, regardless of whether it was
+	// negotiated.
+	testCases = append(testCases, testCase{
+		name: "AlwaysReportRenegotiationInfo-TLS13",
+		config: Config{
+			MaxVersion: VersionTLS13,
+			Bugs: ProtocolBugs{
+				NoRenegotiationInfo: true,
+			},
+		},
+		flags: []string{
+			"-expect-secure-renegotiation",
+		},
+	})
 }
 
 func addDTLSReplayTests() {
diff --git a/src/ssl/test/test_config.cc b/src/ssl/test/test_config.cc
index a06b5e5..5bc9544 100644
--- a/src/ssl/test/test_config.cc
+++ b/src/ssl/test/test_config.cc
@@ -119,6 +119,10 @@
     &TestConfig::expect_sha256_client_cert_resume },
   { "-enable-short-header", &TestConfig::enable_short_header },
   { "-read-with-unfinished-write", &TestConfig::read_with_unfinished_write },
+  { "-expect-secure-renegotiation",
+    &TestConfig::expect_secure_renegotiation },
+  { "-expect-no-secure-renegotiation",
+    &TestConfig::expect_no_secure_renegotiation },
 };
 
 const Flag<std::string> kStringFlags[] = {
@@ -172,6 +176,8 @@
   { "-expect-cipher-aes", &TestConfig::expect_cipher_aes },
   { "-expect-cipher-no-aes", &TestConfig::expect_cipher_no_aes },
   { "-resumption-delay", &TestConfig::resumption_delay },
+  { "-max-send-fragment", &TestConfig::max_send_fragment },
+  { "-read-size", &TestConfig::read_size },
 };
 
 const Flag<std::vector<int>> kIntVectorFlags[] = {
diff --git a/src/ssl/test/test_config.h b/src/ssl/test/test_config.h
index 1307d56..7cf0a6f 100644
--- a/src/ssl/test/test_config.h
+++ b/src/ssl/test/test_config.h
@@ -127,6 +127,10 @@
   bool expect_sha256_client_cert_resume = false;
   bool enable_short_header = false;
   bool read_with_unfinished_write = false;
+  bool expect_secure_renegotiation = false;
+  bool expect_no_secure_renegotiation = false;
+  int max_send_fragment = 0;
+  int read_size = 0;
 };
 
 bool ParseConfig(int argc, char **argv, TestConfig *out_config);
diff --git a/src/tool/client.cc b/src/tool/client.cc
index dd3f846..c832ab5 100644
--- a/src/tool/client.cc
+++ b/src/tool/client.cc
@@ -19,7 +19,9 @@
 #if !defined(OPENSSL_WINDOWS)
 #include <sys/select.h>
 #else
+OPENSSL_MSVC_PRAGMA(warning(push, 3))
 #include <winsock2.h>
+OPENSSL_MSVC_PRAGMA(warning(pop))
 #endif
 
 #include <openssl/err.h>
diff --git a/src/util/doc.go b/src/util/doc.go
index 681b834..987794c 100644
--- a/src/util/doc.go
+++ b/src/util/doc.go
@@ -139,7 +139,7 @@
 }
 
 func extractDecl(lines []string, lineNo int) (decl string, rest []string, restLineNo int, err error) {
-	if len(lines) == 0 {
+	if len(lines) == 0 || len(lines[0]) == 0 {
 		return "", lines, lineNo, nil
 	}
 
diff --git a/win-x86/crypto/modes/ghash-x86.asm b/win-x86/crypto/modes/ghash-x86.asm
index eb493ac..1d350d6 100644
--- a/win-x86/crypto/modes/ghash-x86.asm
+++ b/win-x86/crypto/modes/ghash-x86.asm
@@ -14,205 +14,6 @@
 %else
 section	.text	code
 %endif
-global	_gcm_gmult_4bit_x86
-align	16
-_gcm_gmult_4bit_x86:
-L$_gcm_gmult_4bit_x86_begin:
-	push	ebp
-	push	ebx
-	push	esi
-	push	edi
-	sub	esp,84
-	mov	edi,DWORD [104+esp]
-	mov	esi,DWORD [108+esp]
-	mov	ebp,DWORD [edi]
-	mov	edx,DWORD [4+edi]
-	mov	ecx,DWORD [8+edi]
-	mov	ebx,DWORD [12+edi]
-	mov	DWORD [16+esp],0
-	mov	DWORD [20+esp],471859200
-	mov	DWORD [24+esp],943718400
-	mov	DWORD [28+esp],610271232
-	mov	DWORD [32+esp],1887436800
-	mov	DWORD [36+esp],1822425088
-	mov	DWORD [40+esp],1220542464
-	mov	DWORD [44+esp],1423966208
-	mov	DWORD [48+esp],3774873600
-	mov	DWORD [52+esp],4246732800
-	mov	DWORD [56+esp],3644850176
-	mov	DWORD [60+esp],3311403008
-	mov	DWORD [64+esp],2441084928
-	mov	DWORD [68+esp],2376073216
-	mov	DWORD [72+esp],2847932416
-	mov	DWORD [76+esp],3051356160
-	mov	DWORD [esp],ebp
-	mov	DWORD [4+esp],edx
-	mov	DWORD [8+esp],ecx
-	mov	DWORD [12+esp],ebx
-	shr	ebx,20
-	and	ebx,240
-	mov	ebp,DWORD [4+ebx*1+esi]
-	mov	edx,DWORD [ebx*1+esi]
-	mov	ecx,DWORD [12+ebx*1+esi]
-	mov	ebx,DWORD [8+ebx*1+esi]
-	xor	eax,eax
-	mov	edi,15
-	jmp	NEAR L$000x86_loop
-align	16
-L$000x86_loop:
-	mov	al,bl
-	shrd	ebx,ecx,4
-	and	al,15
-	shrd	ecx,edx,4
-	shrd	edx,ebp,4
-	shr	ebp,4
-	xor	ebp,DWORD [16+eax*4+esp]
-	mov	al,BYTE [edi*1+esp]
-	and	al,240
-	xor	ebx,DWORD [8+eax*1+esi]
-	xor	ecx,DWORD [12+eax*1+esi]
-	xor	edx,DWORD [eax*1+esi]
-	xor	ebp,DWORD [4+eax*1+esi]
-	dec	edi
-	js	NEAR L$001x86_break
-	mov	al,bl
-	shrd	ebx,ecx,4
-	and	al,15
-	shrd	ecx,edx,4
-	shrd	edx,ebp,4
-	shr	ebp,4
-	xor	ebp,DWORD [16+eax*4+esp]
-	mov	al,BYTE [edi*1+esp]
-	shl	al,4
-	xor	ebx,DWORD [8+eax*1+esi]
-	xor	ecx,DWORD [12+eax*1+esi]
-	xor	edx,DWORD [eax*1+esi]
-	xor	ebp,DWORD [4+eax*1+esi]
-	jmp	NEAR L$000x86_loop
-align	16
-L$001x86_break:
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	bswap	ebp
-	mov	edi,DWORD [104+esp]
-	mov	DWORD [12+edi],ebx
-	mov	DWORD [8+edi],ecx
-	mov	DWORD [4+edi],edx
-	mov	DWORD [edi],ebp
-	add	esp,84
-	pop	edi
-	pop	esi
-	pop	ebx
-	pop	ebp
-	ret
-global	_gcm_ghash_4bit_x86
-align	16
-_gcm_ghash_4bit_x86:
-L$_gcm_ghash_4bit_x86_begin:
-	push	ebp
-	push	ebx
-	push	esi
-	push	edi
-	sub	esp,84
-	mov	ebx,DWORD [104+esp]
-	mov	esi,DWORD [108+esp]
-	mov	edi,DWORD [112+esp]
-	mov	ecx,DWORD [116+esp]
-	add	ecx,edi
-	mov	DWORD [116+esp],ecx
-	mov	ebp,DWORD [ebx]
-	mov	edx,DWORD [4+ebx]
-	mov	ecx,DWORD [8+ebx]
-	mov	ebx,DWORD [12+ebx]
-	mov	DWORD [16+esp],0
-	mov	DWORD [20+esp],471859200
-	mov	DWORD [24+esp],943718400
-	mov	DWORD [28+esp],610271232
-	mov	DWORD [32+esp],1887436800
-	mov	DWORD [36+esp],1822425088
-	mov	DWORD [40+esp],1220542464
-	mov	DWORD [44+esp],1423966208
-	mov	DWORD [48+esp],3774873600
-	mov	DWORD [52+esp],4246732800
-	mov	DWORD [56+esp],3644850176
-	mov	DWORD [60+esp],3311403008
-	mov	DWORD [64+esp],2441084928
-	mov	DWORD [68+esp],2376073216
-	mov	DWORD [72+esp],2847932416
-	mov	DWORD [76+esp],3051356160
-align	16
-L$002x86_outer_loop:
-	xor	ebx,DWORD [12+edi]
-	xor	ecx,DWORD [8+edi]
-	xor	edx,DWORD [4+edi]
-	xor	ebp,DWORD [edi]
-	mov	DWORD [12+esp],ebx
-	mov	DWORD [8+esp],ecx
-	mov	DWORD [4+esp],edx
-	mov	DWORD [esp],ebp
-	shr	ebx,20
-	and	ebx,240
-	mov	ebp,DWORD [4+ebx*1+esi]
-	mov	edx,DWORD [ebx*1+esi]
-	mov	ecx,DWORD [12+ebx*1+esi]
-	mov	ebx,DWORD [8+ebx*1+esi]
-	xor	eax,eax
-	mov	edi,15
-	jmp	NEAR L$003x86_loop
-align	16
-L$003x86_loop:
-	mov	al,bl
-	shrd	ebx,ecx,4
-	and	al,15
-	shrd	ecx,edx,4
-	shrd	edx,ebp,4
-	shr	ebp,4
-	xor	ebp,DWORD [16+eax*4+esp]
-	mov	al,BYTE [edi*1+esp]
-	and	al,240
-	xor	ebx,DWORD [8+eax*1+esi]
-	xor	ecx,DWORD [12+eax*1+esi]
-	xor	edx,DWORD [eax*1+esi]
-	xor	ebp,DWORD [4+eax*1+esi]
-	dec	edi
-	js	NEAR L$004x86_break
-	mov	al,bl
-	shrd	ebx,ecx,4
-	and	al,15
-	shrd	ecx,edx,4
-	shrd	edx,ebp,4
-	shr	ebp,4
-	xor	ebp,DWORD [16+eax*4+esp]
-	mov	al,BYTE [edi*1+esp]
-	shl	al,4
-	xor	ebx,DWORD [8+eax*1+esi]
-	xor	ecx,DWORD [12+eax*1+esi]
-	xor	edx,DWORD [eax*1+esi]
-	xor	ebp,DWORD [4+eax*1+esi]
-	jmp	NEAR L$003x86_loop
-align	16
-L$004x86_break:
-	bswap	ebx
-	bswap	ecx
-	bswap	edx
-	bswap	ebp
-	mov	edi,DWORD [112+esp]
-	lea	edi,[16+edi]
-	cmp	edi,DWORD [116+esp]
-	mov	DWORD [112+esp],edi
-	jb	NEAR L$002x86_outer_loop
-	mov	edi,DWORD [104+esp]
-	mov	DWORD [12+edi],ebx
-	mov	DWORD [8+edi],ecx
-	mov	DWORD [4+edi],edx
-	mov	DWORD [edi],ebp
-	add	esp,84
-	pop	edi
-	pop	esi
-	pop	ebx
-	pop	ebp
-	ret
 global	_gcm_gmult_4bit_mmx
 align	16
 _gcm_gmult_4bit_mmx:
@@ -223,10 +24,10 @@
 	push	edi
 	mov	edi,DWORD [20+esp]
 	mov	esi,DWORD [24+esp]
-	call	L$005pic_point
-L$005pic_point:
+	call	L$000pic_point
+L$000pic_point:
 	pop	eax
-	lea	eax,[(L$rem_4bit-L$005pic_point)+eax]
+	lea	eax,[(L$rem_4bit-L$000pic_point)+eax]
 	movzx	ebx,BYTE [15+edi]
 	xor	ecx,ecx
 	mov	edx,ebx
@@ -237,9 +38,9 @@
 	movq	mm0,[8+ecx*1+esi]
 	movq	mm1,[ecx*1+esi]
 	movd	ebx,mm0
-	jmp	NEAR L$006mmx_loop
+	jmp	NEAR L$001mmx_loop
 align	16
-L$006mmx_loop:
+L$001mmx_loop:
 	psrlq	mm0,4
 	and	ebx,15
 	movq	mm2,mm1
@@ -253,7 +54,7 @@
 	pxor	mm1,[edx*1+esi]
 	mov	edx,ecx
 	pxor	mm0,mm2
-	js	NEAR L$007mmx_break
+	js	NEAR L$002mmx_break
 	shl	cl,4
 	and	ebx,15
 	psrlq	mm0,4
@@ -266,9 +67,9 @@
 	movd	ebx,mm0
 	pxor	mm1,[ecx*1+esi]
 	pxor	mm0,mm2
-	jmp	NEAR L$006mmx_loop
+	jmp	NEAR L$001mmx_loop
 align	16
-L$007mmx_break:
+L$002mmx_break:
 	shl	cl,4
 	and	ebx,15
 	psrlq	mm0,4
@@ -323,10 +124,10 @@
 	mov	ecx,DWORD [28+esp]
 	mov	edx,DWORD [32+esp]
 	mov	ebp,esp
-	call	L$008pic_point
-L$008pic_point:
+	call	L$003pic_point
+L$003pic_point:
 	pop	esi
-	lea	esi,[(L$rem_8bit-L$008pic_point)+esi]
+	lea	esi,[(L$rem_8bit-L$003pic_point)+esi]
 	sub	esp,544
 	and	esp,-64
 	sub	esp,16
@@ -565,7 +366,7 @@
 	mov	ebx,DWORD [8+eax]
 	mov	edx,DWORD [12+eax]
 align	16
-L$009outer:
+L$004outer:
 	xor	edx,DWORD [12+ecx]
 	xor	ebx,DWORD [8+ecx]
 	pxor	mm6,[ecx]
@@ -900,7 +701,7 @@
 	pshufw	mm6,mm6,27
 	bswap	ebx
 	cmp	ecx,DWORD [552+esp]
-	jne	NEAR L$009outer
+	jne	NEAR L$004outer
 	mov	eax,DWORD [544+esp]
 	mov	DWORD [12+eax],edx
 	mov	DWORD [8+eax],ebx
@@ -918,10 +719,10 @@
 L$_gcm_init_clmul_begin:
 	mov	edx,DWORD [4+esp]
 	mov	eax,DWORD [8+esp]
-	call	L$010pic
-L$010pic:
+	call	L$005pic
+L$005pic:
 	pop	ecx
-	lea	ecx,[(L$bswap-L$010pic)+ecx]
+	lea	ecx,[(L$bswap-L$005pic)+ecx]
 	movdqu	xmm2,[eax]
 	pshufd	xmm2,xmm2,78
 	pshufd	xmm4,xmm2,255
@@ -985,10 +786,10 @@
 L$_gcm_gmult_clmul_begin:
 	mov	eax,DWORD [4+esp]
 	mov	edx,DWORD [8+esp]
-	call	L$011pic
-L$011pic:
+	call	L$006pic
+L$006pic:
 	pop	ecx
-	lea	ecx,[(L$bswap-L$011pic)+ecx]
+	lea	ecx,[(L$bswap-L$006pic)+ecx]
 	movdqu	xmm0,[eax]
 	movdqa	xmm5,[ecx]
 	movups	xmm2,[edx]
@@ -1042,16 +843,16 @@
 	mov	edx,DWORD [24+esp]
 	mov	esi,DWORD [28+esp]
 	mov	ebx,DWORD [32+esp]
-	call	L$012pic
-L$012pic:
+	call	L$007pic
+L$007pic:
 	pop	ecx
-	lea	ecx,[(L$bswap-L$012pic)+ecx]
+	lea	ecx,[(L$bswap-L$007pic)+ecx]
 	movdqu	xmm0,[eax]
 	movdqa	xmm5,[ecx]
 	movdqu	xmm2,[edx]
 db	102,15,56,0,197
 	sub	ebx,16
-	jz	NEAR L$013odd_tail
+	jz	NEAR L$008odd_tail
 	movdqu	xmm3,[esi]
 	movdqu	xmm6,[16+esi]
 db	102,15,56,0,221
@@ -1068,10 +869,10 @@
 	movups	xmm2,[16+edx]
 	nop
 	sub	ebx,32
-	jbe	NEAR L$014even_tail
-	jmp	NEAR L$015mod_loop
+	jbe	NEAR L$009even_tail
+	jmp	NEAR L$010mod_loop
 align	32
-L$015mod_loop:
+L$010mod_loop:
 	pshufd	xmm4,xmm0,78
 	movdqa	xmm1,xmm0
 	pxor	xmm4,xmm0
@@ -1126,8 +927,8 @@
 db	102,15,58,68,221,0
 	lea	esi,[32+esi]
 	sub	ebx,32
-	ja	NEAR L$015mod_loop
-L$014even_tail:
+	ja	NEAR L$010mod_loop
+L$009even_tail:
 	pshufd	xmm4,xmm0,78
 	movdqa	xmm1,xmm0
 	pxor	xmm4,xmm0
@@ -1166,9 +967,9 @@
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
 	test	ebx,ebx
-	jnz	NEAR L$016done
+	jnz	NEAR L$011done
 	movups	xmm2,[edx]
-L$013odd_tail:
+L$008odd_tail:
 	movdqu	xmm3,[esi]
 db	102,15,56,0,221
 	pxor	xmm0,xmm3
@@ -1207,7 +1008,7 @@
 	pxor	xmm0,xmm4
 	psrlq	xmm0,1
 	pxor	xmm0,xmm1
-L$016done:
+L$011done:
 db	102,15,56,0,197
 	movdqu	[eax],xmm0
 	pop	edi
diff --git a/win-x86_64/crypto/ec/p256-x86_64-asm.asm b/win-x86_64/crypto/ec/p256-x86_64-asm.asm
index cbcf883..55db00e 100644
--- a/win-x86_64/crypto/ec/p256-x86_64-asm.asm
+++ b/win-x86_64/crypto/ec/p256-x86_64-asm.asm
@@ -21,58 +21,6 @@
 	DQ	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
 
 
-ALIGN	64
-ecp_nistz256_mul_by_2:
-	mov	QWORD[8+rsp],rdi	;WIN64 prologue
-	mov	QWORD[16+rsp],rsi
-	mov	rax,rsp
-$L$SEH_begin_ecp_nistz256_mul_by_2:
-	mov	rdi,rcx
-	mov	rsi,rdx
-
-
-	push	r12
-	push	r13
-
-	mov	r8,QWORD[rsi]
-	xor	r13,r13
-	mov	r9,QWORD[8+rsi]
-	add	r8,r8
-	mov	r10,QWORD[16+rsi]
-	adc	r9,r9
-	mov	r11,QWORD[24+rsi]
-	lea	rsi,[$L$poly]
-	mov	rax,r8
-	adc	r10,r10
-	adc	r11,r11
-	mov	rdx,r9
-	adc	r13,0
-
-	sub	r8,QWORD[rsi]
-	mov	rcx,r10
-	sbb	r9,QWORD[8+rsi]
-	sbb	r10,QWORD[16+rsi]
-	mov	r12,r11
-	sbb	r11,QWORD[24+rsi]
-	sbb	r13,0
-
-	cmovc	r8,rax
-	cmovc	r9,rdx
-	mov	QWORD[rdi],r8
-	cmovc	r10,rcx
-	mov	QWORD[8+rdi],r9
-	cmovc	r11,r12
-	mov	QWORD[16+rdi],r10
-	mov	QWORD[24+rdi],r11
-
-	pop	r13
-	pop	r12
-	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
-	mov	rsi,QWORD[16+rsp]
-	DB	0F3h,0C3h		;repret
-$L$SEH_end_ecp_nistz256_mul_by_2:
-
-
 
 global	ecp_nistz256_neg
 
@@ -595,112 +543,6 @@
 
 
 
-
-
-
-
-global	ecp_nistz256_from_mont
-
-ALIGN	32
-ecp_nistz256_from_mont:
-	mov	QWORD[8+rsp],rdi	;WIN64 prologue
-	mov	QWORD[16+rsp],rsi
-	mov	rax,rsp
-$L$SEH_begin_ecp_nistz256_from_mont:
-	mov	rdi,rcx
-	mov	rsi,rdx
-
-
-	push	r12
-	push	r13
-
-	mov	rax,QWORD[rsi]
-	mov	r13,QWORD[(($L$poly+24))]
-	mov	r9,QWORD[8+rsi]
-	mov	r10,QWORD[16+rsi]
-	mov	r11,QWORD[24+rsi]
-	mov	r8,rax
-	mov	r12,QWORD[(($L$poly+8))]
-
-
-
-	mov	rcx,rax
-	shl	r8,32
-	mul	r13
-	shr	rcx,32
-	add	r9,r8
-	adc	r10,rcx
-	adc	r11,rax
-	mov	rax,r9
-	adc	rdx,0
-
-
-
-	mov	rcx,r9
-	shl	r9,32
-	mov	r8,rdx
-	mul	r13
-	shr	rcx,32
-	add	r10,r9
-	adc	r11,rcx
-	adc	r8,rax
-	mov	rax,r10
-	adc	rdx,0
-
-
-
-	mov	rcx,r10
-	shl	r10,32
-	mov	r9,rdx
-	mul	r13
-	shr	rcx,32
-	add	r11,r10
-	adc	r8,rcx
-	adc	r9,rax
-	mov	rax,r11
-	adc	rdx,0
-
-
-
-	mov	rcx,r11
-	shl	r11,32
-	mov	r10,rdx
-	mul	r13
-	shr	rcx,32
-	add	r8,r11
-	adc	r9,rcx
-	mov	rcx,r8
-	adc	r10,rax
-	mov	rsi,r9
-	adc	rdx,0
-
-
-
-	sub	r8,-1
-	mov	rax,r10
-	sbb	r9,r12
-	sbb	r10,0
-	mov	r11,rdx
-	sbb	rdx,r13
-	sbb	r13,r13
-
-	cmovnz	r8,rcx
-	cmovnz	r9,rsi
-	mov	QWORD[rdi],r8
-	cmovnz	r10,rax
-	mov	QWORD[8+rdi],r9
-	cmovz	r11,rdx
-	mov	QWORD[16+rdi],r10
-	mov	QWORD[24+rdi],r11
-
-	pop	r13
-	pop	r12
-	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
-	mov	rsi,QWORD[16+rsp]
-	DB	0F3h,0C3h		;repret
-$L$SEH_end_ecp_nistz256_from_mont:
-
-
 global	ecp_nistz256_select_w5
 
 ALIGN	32
diff --git a/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm b/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm
index d7fff6a..741a9e4 100644
--- a/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm
+++ b/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm
@@ -5,16 +5,982 @@
 section	.text code align=64
 
 
-global	aesni_gcm_encrypt
 
-aesni_gcm_encrypt:
-	xor	eax,eax
+ALIGN	32
+_aesni_ctr32_ghash_6x:
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	sub	rdx,6
+	vpxor	xmm4,xmm4,xmm4
+	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
+	vpaddb	xmm10,xmm1,xmm2
+	vpaddb	xmm11,xmm10,xmm2
+	vpaddb	xmm12,xmm11,xmm2
+	vpaddb	xmm13,xmm12,xmm2
+	vpaddb	xmm14,xmm13,xmm2
+	vpxor	xmm9,xmm1,xmm15
+	vmovdqu	XMMWORD[(16+8)+rsp],xmm4
+	jmp	NEAR $L$oop6x
+
+ALIGN	32
+$L$oop6x:
+	add	ebx,100663296
+	jc	NEAR $L$handle_ctr32
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpaddb	xmm1,xmm14,xmm2
+	vpxor	xmm10,xmm10,xmm15
+	vpxor	xmm11,xmm11,xmm15
+
+$L$resume_ctr32:
+	vmovdqu	XMMWORD[r8],xmm1
+	vpclmulqdq	xmm5,xmm7,xmm3,0x10
+	vpxor	xmm12,xmm12,xmm15
+	vmovups	xmm2,XMMWORD[((16-128))+rcx]
+	vpclmulqdq	xmm6,xmm7,xmm3,0x01
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xor	r12,r12
+	cmp	r15,r14
+
+	vaesenc	xmm9,xmm9,xmm2
+	vmovdqu	xmm0,XMMWORD[((48+8))+rsp]
+	vpxor	xmm13,xmm13,xmm15
+	vpclmulqdq	xmm1,xmm7,xmm3,0x00
+	vaesenc	xmm10,xmm10,xmm2
+	vpxor	xmm14,xmm14,xmm15
+	setnc	r12b
+	vpclmulqdq	xmm7,xmm7,xmm3,0x11
+	vaesenc	xmm11,xmm11,xmm2
+	vmovdqu	xmm3,XMMWORD[((16-32))+r9]
+	neg	r12
+	vaesenc	xmm12,xmm12,xmm2
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm3,0x00
+	vpxor	xmm8,xmm8,xmm4
+	vaesenc	xmm13,xmm13,xmm2
+	vpxor	xmm4,xmm1,xmm5
+	and	r12,0x60
+	vmovups	xmm15,XMMWORD[((32-128))+rcx]
+	vpclmulqdq	xmm1,xmm0,xmm3,0x10
+	vaesenc	xmm14,xmm14,xmm2
+
+	vpclmulqdq	xmm2,xmm0,xmm3,0x01
+	lea	r14,[r12*1+r14]
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
+	vpclmulqdq	xmm3,xmm0,xmm3,0x11
+	vmovdqu	xmm0,XMMWORD[((64+8))+rsp]
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[88+r14]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[80+r14]
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((32+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((40+8))+rsp],r12
+	vmovdqu	xmm5,XMMWORD[((48-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((48-128))+rcx]
+	vpxor	xmm6,xmm6,xmm1
+	vpclmulqdq	xmm1,xmm0,xmm5,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm2
+	vpclmulqdq	xmm2,xmm0,xmm5,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm7,xmm7,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm5,0x01
+	vaesenc	xmm11,xmm11,xmm15
+	vpclmulqdq	xmm5,xmm0,xmm5,0x11
+	vmovdqu	xmm0,XMMWORD[((80+8))+rsp]
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm4,xmm4,xmm1
+	vmovdqu	xmm1,XMMWORD[((64-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((64-128))+rcx]
+	vpxor	xmm6,xmm6,xmm2
+	vpclmulqdq	xmm2,xmm0,xmm1,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm1,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[72+r14]
+	vpxor	xmm7,xmm7,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm1,0x01
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[64+r14]
+	vpclmulqdq	xmm1,xmm0,xmm1,0x11
+	vmovdqu	xmm0,XMMWORD[((96+8))+rsp]
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((48+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((56+8))+rsp],r12
+	vpxor	xmm4,xmm4,xmm2
+	vmovdqu	xmm2,XMMWORD[((96-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((80-128))+rcx]
+	vpxor	xmm6,xmm6,xmm3
+	vpclmulqdq	xmm3,xmm0,xmm2,0x00
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm0,xmm2,0x10
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[56+r14]
+	vpxor	xmm7,xmm7,xmm1
+	vpclmulqdq	xmm1,xmm0,xmm2,0x01
+	vpxor	xmm8,xmm8,XMMWORD[((112+8))+rsp]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[48+r14]
+	vpclmulqdq	xmm2,xmm0,xmm2,0x11
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((64+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((72+8))+rsp],r12
+	vpxor	xmm4,xmm4,xmm3
+	vmovdqu	xmm3,XMMWORD[((112-32))+r9]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vmovups	xmm15,XMMWORD[((96-128))+rcx]
+	vpxor	xmm6,xmm6,xmm5
+	vpclmulqdq	xmm5,xmm8,xmm3,0x10
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm6,xmm6,xmm1
+	vpclmulqdq	xmm1,xmm8,xmm3,0x01
+	vaesenc	xmm10,xmm10,xmm15
+	movbe	r13,QWORD[40+r14]
+	vpxor	xmm7,xmm7,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm3,0x00
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[32+r14]
+	vpclmulqdq	xmm8,xmm8,xmm3,0x11
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((80+8))+rsp],r13
+	vaesenc	xmm13,xmm13,xmm15
+	mov	QWORD[((88+8))+rsp],r12
+	vpxor	xmm6,xmm6,xmm5
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm6,xmm6,xmm1
+
+	vmovups	xmm15,XMMWORD[((112-128))+rcx]
+	vpslldq	xmm5,xmm6,8
+	vpxor	xmm4,xmm4,xmm2
+	vmovdqu	xmm3,XMMWORD[16+r11]
+
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm7,xmm7,xmm8
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm4,xmm4,xmm5
+	movbe	r13,QWORD[24+r14]
+	vaesenc	xmm11,xmm11,xmm15
+	movbe	r12,QWORD[16+r14]
+	vpalignr	xmm0,xmm4,xmm4,8
+	vpclmulqdq	xmm4,xmm4,xmm3,0x10
+	mov	QWORD[((96+8))+rsp],r13
+	vaesenc	xmm12,xmm12,xmm15
+	mov	QWORD[((104+8))+rsp],r12
+	vaesenc	xmm13,xmm13,xmm15
+	vmovups	xmm1,XMMWORD[((128-128))+rcx]
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vmovups	xmm15,XMMWORD[((144-128))+rcx]
+	vaesenc	xmm10,xmm10,xmm1
+	vpsrldq	xmm6,xmm6,8
+	vaesenc	xmm11,xmm11,xmm1
+	vpxor	xmm7,xmm7,xmm6
+	vaesenc	xmm12,xmm12,xmm1
+	vpxor	xmm4,xmm4,xmm0
+	movbe	r13,QWORD[8+r14]
+	vaesenc	xmm13,xmm13,xmm1
+	movbe	r12,QWORD[r14]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((160-128))+rcx]
+	cmp	ebp,11
+	jb	NEAR $L$enc_tail
+
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+	vmovups	xmm15,XMMWORD[((176-128))+rcx]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((192-128))+rcx]
+	je	NEAR $L$enc_tail
+
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+
+	vaesenc	xmm9,xmm9,xmm1
+	vaesenc	xmm10,xmm10,xmm1
+	vaesenc	xmm11,xmm11,xmm1
+	vaesenc	xmm12,xmm12,xmm1
+	vaesenc	xmm13,xmm13,xmm1
+	vmovups	xmm15,XMMWORD[((208-128))+rcx]
+	vaesenc	xmm14,xmm14,xmm1
+	vmovups	xmm1,XMMWORD[((224-128))+rcx]
+	jmp	NEAR $L$enc_tail
+
+ALIGN	32
+$L$handle_ctr32:
+	vmovdqu	xmm0,XMMWORD[r11]
+	vpshufb	xmm6,xmm1,xmm0
+	vmovdqu	xmm5,XMMWORD[48+r11]
+	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
+	vpaddd	xmm11,xmm6,xmm5
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpaddd	xmm12,xmm10,xmm5
+	vpshufb	xmm10,xmm10,xmm0
+	vpaddd	xmm13,xmm11,xmm5
+	vpshufb	xmm11,xmm11,xmm0
+	vpxor	xmm10,xmm10,xmm15
+	vpaddd	xmm14,xmm12,xmm5
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm11,xmm11,xmm15
+	vpaddd	xmm1,xmm13,xmm5
+	vpshufb	xmm13,xmm13,xmm0
+	vpshufb	xmm14,xmm14,xmm0
+	vpshufb	xmm1,xmm1,xmm0
+	jmp	NEAR $L$resume_ctr32
+
+ALIGN	32
+$L$enc_tail:
+	vaesenc	xmm9,xmm9,xmm15
+	vmovdqu	XMMWORD[(16+8)+rsp],xmm7
+	vpalignr	xmm8,xmm4,xmm4,8
+	vaesenc	xmm10,xmm10,xmm15
+	vpclmulqdq	xmm4,xmm4,xmm3,0x10
+	vpxor	xmm2,xmm1,XMMWORD[rdi]
+	vaesenc	xmm11,xmm11,xmm15
+	vpxor	xmm0,xmm1,XMMWORD[16+rdi]
+	vaesenc	xmm12,xmm12,xmm15
+	vpxor	xmm5,xmm1,XMMWORD[32+rdi]
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm6,xmm1,XMMWORD[48+rdi]
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm7,xmm1,XMMWORD[64+rdi]
+	vpxor	xmm3,xmm1,XMMWORD[80+rdi]
+	vmovdqu	xmm1,XMMWORD[r8]
+
+	vaesenclast	xmm9,xmm9,xmm2
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	vaesenclast	xmm10,xmm10,xmm0
+	vpaddb	xmm0,xmm1,xmm2
+	mov	QWORD[((112+8))+rsp],r13
+	lea	rdi,[96+rdi]
+	vaesenclast	xmm11,xmm11,xmm5
+	vpaddb	xmm5,xmm0,xmm2
+	mov	QWORD[((120+8))+rsp],r12
+	lea	rsi,[96+rsi]
+	vmovdqu	xmm15,XMMWORD[((0-128))+rcx]
+	vaesenclast	xmm12,xmm12,xmm6
+	vpaddb	xmm6,xmm5,xmm2
+	vaesenclast	xmm13,xmm13,xmm7
+	vpaddb	xmm7,xmm6,xmm2
+	vaesenclast	xmm14,xmm14,xmm3
+	vpaddb	xmm3,xmm7,xmm2
+
+	add	r10,0x60
+	sub	rdx,0x6
+	jc	NEAR $L$6x_done
+
+	vmovups	XMMWORD[(-96)+rsi],xmm9
+	vpxor	xmm9,xmm1,xmm15
+	vmovups	XMMWORD[(-80)+rsi],xmm10
+	vmovdqa	xmm10,xmm0
+	vmovups	XMMWORD[(-64)+rsi],xmm11
+	vmovdqa	xmm11,xmm5
+	vmovups	XMMWORD[(-48)+rsi],xmm12
+	vmovdqa	xmm12,xmm6
+	vmovups	XMMWORD[(-32)+rsi],xmm13
+	vmovdqa	xmm13,xmm7
+	vmovups	XMMWORD[(-16)+rsi],xmm14
+	vmovdqa	xmm14,xmm3
+	vmovdqu	xmm7,XMMWORD[((32+8))+rsp]
+	jmp	NEAR $L$oop6x
+
+$L$6x_done:
+	vpxor	xmm8,xmm8,XMMWORD[((16+8))+rsp]
+	vpxor	xmm8,xmm8,xmm4
+
 	DB	0F3h,0C3h		;repret
 
-
 global	aesni_gcm_decrypt
 
+ALIGN	32
 aesni_gcm_decrypt:
-	xor	eax,eax
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesni_gcm_decrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+	xor	r10,r10
+
+
+
+	cmp	rdx,0x60
+	jb	NEAR $L$gcm_dec_abort
+
+	lea	rax,[rsp]
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	lea	rsp,[((-168))+rsp]
+	movaps	XMMWORD[(-216)+rax],xmm6
+	movaps	XMMWORD[(-200)+rax],xmm7
+	movaps	XMMWORD[(-184)+rax],xmm8
+	movaps	XMMWORD[(-168)+rax],xmm9
+	movaps	XMMWORD[(-152)+rax],xmm10
+	movaps	XMMWORD[(-136)+rax],xmm11
+	movaps	XMMWORD[(-120)+rax],xmm12
+	movaps	XMMWORD[(-104)+rax],xmm13
+	movaps	XMMWORD[(-88)+rax],xmm14
+	movaps	XMMWORD[(-72)+rax],xmm15
+$L$gcm_dec_body:
+	vzeroupper
+
+	vmovdqu	xmm1,XMMWORD[r8]
+	add	rsp,-128
+	mov	ebx,DWORD[12+r8]
+	lea	r11,[$L$bswap_mask]
+	lea	r14,[((-128))+rcx]
+	mov	r15,0xf80
+	vmovdqu	xmm8,XMMWORD[r9]
+	and	rsp,-128
+	vmovdqu	xmm0,XMMWORD[r11]
+	lea	rcx,[128+rcx]
+	lea	r9,[((32+32))+r9]
+	mov	ebp,DWORD[((240-128))+rcx]
+	vpshufb	xmm8,xmm8,xmm0
+
+	and	r14,r15
+	and	r15,rsp
+	sub	r15,r14
+	jc	NEAR $L$dec_no_key_aliasing
+	cmp	r15,768
+	jnc	NEAR $L$dec_no_key_aliasing
+	sub	rsp,r15
+$L$dec_no_key_aliasing:
+
+	vmovdqu	xmm7,XMMWORD[80+rdi]
+	lea	r14,[rdi]
+	vmovdqu	xmm4,XMMWORD[64+rdi]
+
+
+
+
+
+
+
+	lea	r15,[((-192))+rdx*1+rdi]
+
+	vmovdqu	xmm5,XMMWORD[48+rdi]
+	shr	rdx,4
+	xor	r10,r10
+	vmovdqu	xmm6,XMMWORD[32+rdi]
+	vpshufb	xmm7,xmm7,xmm0
+	vmovdqu	xmm2,XMMWORD[16+rdi]
+	vpshufb	xmm4,xmm4,xmm0
+	vmovdqu	xmm3,XMMWORD[rdi]
+	vpshufb	xmm5,xmm5,xmm0
+	vmovdqu	XMMWORD[48+rsp],xmm4
+	vpshufb	xmm6,xmm6,xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm5
+	vpshufb	xmm2,xmm2,xmm0
+	vmovdqu	XMMWORD[80+rsp],xmm6
+	vpshufb	xmm3,xmm3,xmm0
+	vmovdqu	XMMWORD[96+rsp],xmm2
+	vmovdqu	XMMWORD[112+rsp],xmm3
+
+	call	_aesni_ctr32_ghash_6x
+
+	vmovups	XMMWORD[(-96)+rsi],xmm9
+	vmovups	XMMWORD[(-80)+rsi],xmm10
+	vmovups	XMMWORD[(-64)+rsi],xmm11
+	vmovups	XMMWORD[(-48)+rsi],xmm12
+	vmovups	XMMWORD[(-32)+rsi],xmm13
+	vmovups	XMMWORD[(-16)+rsi],xmm14
+
+	vpshufb	xmm8,xmm8,XMMWORD[r11]
+	vmovdqu	XMMWORD[(-64)+r9],xmm8
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-216))+rax]
+	movaps	xmm7,XMMWORD[((-200))+rax]
+	movaps	xmm8,XMMWORD[((-184))+rax]
+	movaps	xmm9,XMMWORD[((-168))+rax]
+	movaps	xmm10,XMMWORD[((-152))+rax]
+	movaps	xmm11,XMMWORD[((-136))+rax]
+	movaps	xmm12,XMMWORD[((-120))+rax]
+	movaps	xmm13,XMMWORD[((-104))+rax]
+	movaps	xmm14,XMMWORD[((-88))+rax]
+	movaps	xmm15,XMMWORD[((-72))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	lea	rsp,[rax]
+$L$gcm_dec_abort:
+	mov	rax,r10
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_aesni_gcm_decrypt:
+
+ALIGN	32
+_aesni_ctr32_6x:
+	vmovdqu	xmm4,XMMWORD[((0-128))+rcx]
+	vmovdqu	xmm2,XMMWORD[32+r11]
+	lea	r13,[((-1))+rbp]
+	vmovups	xmm15,XMMWORD[((16-128))+rcx]
+	lea	r12,[((32-128))+rcx]
+	vpxor	xmm9,xmm1,xmm4
+	add	ebx,100663296
+	jc	NEAR $L$handle_ctr32_2
+	vpaddb	xmm10,xmm1,xmm2
+	vpaddb	xmm11,xmm10,xmm2
+	vpxor	xmm10,xmm10,xmm4
+	vpaddb	xmm12,xmm11,xmm2
+	vpxor	xmm11,xmm11,xmm4
+	vpaddb	xmm13,xmm12,xmm2
+	vpxor	xmm12,xmm12,xmm4
+	vpaddb	xmm14,xmm13,xmm2
+	vpxor	xmm13,xmm13,xmm4
+	vpaddb	xmm1,xmm14,xmm2
+	vpxor	xmm14,xmm14,xmm4
+	jmp	NEAR $L$oop_ctr32
+
+ALIGN	16
+$L$oop_ctr32:
+	vaesenc	xmm9,xmm9,xmm15
+	vaesenc	xmm10,xmm10,xmm15
+	vaesenc	xmm11,xmm11,xmm15
+	vaesenc	xmm12,xmm12,xmm15
+	vaesenc	xmm13,xmm13,xmm15
+	vaesenc	xmm14,xmm14,xmm15
+	vmovups	xmm15,XMMWORD[r12]
+	lea	r12,[16+r12]
+	dec	r13d
+	jnz	NEAR $L$oop_ctr32
+
+	vmovdqu	xmm3,XMMWORD[r12]
+	vaesenc	xmm9,xmm9,xmm15
+	vpxor	xmm4,xmm3,XMMWORD[rdi]
+	vaesenc	xmm10,xmm10,xmm15
+	vpxor	xmm5,xmm3,XMMWORD[16+rdi]
+	vaesenc	xmm11,xmm11,xmm15
+	vpxor	xmm6,xmm3,XMMWORD[32+rdi]
+	vaesenc	xmm12,xmm12,xmm15
+	vpxor	xmm8,xmm3,XMMWORD[48+rdi]
+	vaesenc	xmm13,xmm13,xmm15
+	vpxor	xmm2,xmm3,XMMWORD[64+rdi]
+	vaesenc	xmm14,xmm14,xmm15
+	vpxor	xmm3,xmm3,XMMWORD[80+rdi]
+	lea	rdi,[96+rdi]
+
+	vaesenclast	xmm9,xmm9,xmm4
+	vaesenclast	xmm10,xmm10,xmm5
+	vaesenclast	xmm11,xmm11,xmm6
+	vaesenclast	xmm12,xmm12,xmm8
+	vaesenclast	xmm13,xmm13,xmm2
+	vaesenclast	xmm14,xmm14,xmm3
+	vmovups	XMMWORD[rsi],xmm9
+	vmovups	XMMWORD[16+rsi],xmm10
+	vmovups	XMMWORD[32+rsi],xmm11
+	vmovups	XMMWORD[48+rsi],xmm12
+	vmovups	XMMWORD[64+rsi],xmm13
+	vmovups	XMMWORD[80+rsi],xmm14
+	lea	rsi,[96+rsi]
+
+	DB	0F3h,0C3h		;repret
+ALIGN	32
+$L$handle_ctr32_2:
+	vpshufb	xmm6,xmm1,xmm0
+	vmovdqu	xmm5,XMMWORD[48+r11]
+	vpaddd	xmm10,xmm6,XMMWORD[64+r11]
+	vpaddd	xmm11,xmm6,xmm5
+	vpaddd	xmm12,xmm10,xmm5
+	vpshufb	xmm10,xmm10,xmm0
+	vpaddd	xmm13,xmm11,xmm5
+	vpshufb	xmm11,xmm11,xmm0
+	vpxor	xmm10,xmm10,xmm4
+	vpaddd	xmm14,xmm12,xmm5
+	vpshufb	xmm12,xmm12,xmm0
+	vpxor	xmm11,xmm11,xmm4
+	vpaddd	xmm1,xmm13,xmm5
+	vpshufb	xmm13,xmm13,xmm0
+	vpxor	xmm12,xmm12,xmm4
+	vpshufb	xmm14,xmm14,xmm0
+	vpxor	xmm13,xmm13,xmm4
+	vpshufb	xmm1,xmm1,xmm0
+	vpxor	xmm14,xmm14,xmm4
+	jmp	NEAR $L$oop_ctr32
+
+
+global	aesni_gcm_encrypt
+
+ALIGN	32
+aesni_gcm_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesni_gcm_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+	xor	r10,r10
+
+
+
+
+	cmp	rdx,0x60*3
+	jb	NEAR $L$gcm_enc_abort
+
+	lea	rax,[rsp]
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	lea	rsp,[((-168))+rsp]
+	movaps	XMMWORD[(-216)+rax],xmm6
+	movaps	XMMWORD[(-200)+rax],xmm7
+	movaps	XMMWORD[(-184)+rax],xmm8
+	movaps	XMMWORD[(-168)+rax],xmm9
+	movaps	XMMWORD[(-152)+rax],xmm10
+	movaps	XMMWORD[(-136)+rax],xmm11
+	movaps	XMMWORD[(-120)+rax],xmm12
+	movaps	XMMWORD[(-104)+rax],xmm13
+	movaps	XMMWORD[(-88)+rax],xmm14
+	movaps	XMMWORD[(-72)+rax],xmm15
+$L$gcm_enc_body:
+	vzeroupper
+
+	vmovdqu	xmm1,XMMWORD[r8]
+	add	rsp,-128
+	mov	ebx,DWORD[12+r8]
+	lea	r11,[$L$bswap_mask]
+	lea	r14,[((-128))+rcx]
+	mov	r15,0xf80
+	lea	rcx,[128+rcx]
+	vmovdqu	xmm0,XMMWORD[r11]
+	and	rsp,-128
+	mov	ebp,DWORD[((240-128))+rcx]
+
+	and	r14,r15
+	and	r15,rsp
+	sub	r15,r14
+	jc	NEAR $L$enc_no_key_aliasing
+	cmp	r15,768
+	jnc	NEAR $L$enc_no_key_aliasing
+	sub	rsp,r15
+$L$enc_no_key_aliasing:
+
+	lea	r14,[rsi]
+
+
+
+
+
+
+
+
+	lea	r15,[((-192))+rdx*1+rsi]
+
+	shr	rdx,4
+
+	call	_aesni_ctr32_6x
+	vpshufb	xmm8,xmm9,xmm0
+	vpshufb	xmm2,xmm10,xmm0
+	vmovdqu	XMMWORD[112+rsp],xmm8
+	vpshufb	xmm4,xmm11,xmm0
+	vmovdqu	XMMWORD[96+rsp],xmm2
+	vpshufb	xmm5,xmm12,xmm0
+	vmovdqu	XMMWORD[80+rsp],xmm4
+	vpshufb	xmm6,xmm13,xmm0
+	vmovdqu	XMMWORD[64+rsp],xmm5
+	vpshufb	xmm7,xmm14,xmm0
+	vmovdqu	XMMWORD[48+rsp],xmm6
+
+	call	_aesni_ctr32_6x
+
+	vmovdqu	xmm8,XMMWORD[r9]
+	lea	r9,[((32+32))+r9]
+	sub	rdx,12
+	mov	r10,0x60*2
+	vpshufb	xmm8,xmm8,xmm0
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	xmm7,XMMWORD[32+rsp]
+	vmovdqu	xmm0,XMMWORD[r11]
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpunpckhqdq	xmm1,xmm7,xmm7
+	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
+	vmovups	XMMWORD[(-96)+rsi],xmm9
+	vpshufb	xmm9,xmm9,xmm0
+	vpxor	xmm1,xmm1,xmm7
+	vmovups	XMMWORD[(-80)+rsi],xmm10
+	vpshufb	xmm10,xmm10,xmm0
+	vmovups	XMMWORD[(-64)+rsi],xmm11
+	vpshufb	xmm11,xmm11,xmm0
+	vmovups	XMMWORD[(-48)+rsi],xmm12
+	vpshufb	xmm12,xmm12,xmm0
+	vmovups	XMMWORD[(-32)+rsi],xmm13
+	vpshufb	xmm13,xmm13,xmm0
+	vmovups	XMMWORD[(-16)+rsi],xmm14
+	vpshufb	xmm14,xmm14,xmm0
+	vmovdqu	XMMWORD[16+rsp],xmm9
+	vmovdqu	xmm6,XMMWORD[48+rsp]
+	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
+	vpunpckhqdq	xmm2,xmm6,xmm6
+	vpclmulqdq	xmm5,xmm7,xmm3,0x00
+	vpxor	xmm2,xmm2,xmm6
+	vpclmulqdq	xmm7,xmm7,xmm3,0x11
+	vpclmulqdq	xmm1,xmm1,xmm15,0x00
+
+	vmovdqu	xmm9,XMMWORD[64+rsp]
+	vpclmulqdq	xmm4,xmm6,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm5,xmm9,xmm9
+	vpclmulqdq	xmm6,xmm6,xmm0,0x11
+	vpxor	xmm5,xmm5,xmm9
+	vpxor	xmm6,xmm6,xmm7
+	vpclmulqdq	xmm2,xmm2,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
+	vpxor	xmm2,xmm2,xmm1
+
+	vmovdqu	xmm1,XMMWORD[80+rsp]
+	vpclmulqdq	xmm7,xmm9,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
+	vpxor	xmm7,xmm7,xmm4
+	vpunpckhqdq	xmm4,xmm1,xmm1
+	vpclmulqdq	xmm9,xmm9,xmm3,0x11
+	vpxor	xmm4,xmm4,xmm1
+	vpxor	xmm9,xmm9,xmm6
+	vpclmulqdq	xmm5,xmm5,xmm15,0x00
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm2,XMMWORD[96+rsp]
+	vpclmulqdq	xmm6,xmm1,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
+	vpxor	xmm6,xmm6,xmm7
+	vpunpckhqdq	xmm7,xmm2,xmm2
+	vpclmulqdq	xmm1,xmm1,xmm0,0x11
+	vpxor	xmm7,xmm7,xmm2
+	vpxor	xmm1,xmm1,xmm9
+	vpclmulqdq	xmm4,xmm4,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+
+	vpxor	xmm8,xmm8,XMMWORD[112+rsp]
+	vpclmulqdq	xmm5,xmm2,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
+	vpunpckhqdq	xmm9,xmm8,xmm8
+	vpxor	xmm5,xmm5,xmm6
+	vpclmulqdq	xmm2,xmm2,xmm3,0x11
+	vpxor	xmm9,xmm9,xmm8
+	vpxor	xmm2,xmm2,xmm1
+	vpclmulqdq	xmm7,xmm7,xmm15,0x00
+	vpxor	xmm4,xmm7,xmm4
+
+	vpclmulqdq	xmm6,xmm8,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((0-32))+r9]
+	vpunpckhqdq	xmm1,xmm14,xmm14
+	vpclmulqdq	xmm8,xmm8,xmm0,0x11
+	vpxor	xmm1,xmm1,xmm14
+	vpxor	xmm5,xmm6,xmm5
+	vpclmulqdq	xmm9,xmm9,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((32-32))+r9]
+	vpxor	xmm7,xmm8,xmm2
+	vpxor	xmm6,xmm9,xmm4
+
+	vmovdqu	xmm0,XMMWORD[((16-32))+r9]
+	vpxor	xmm9,xmm7,xmm5
+	vpclmulqdq	xmm4,xmm14,xmm3,0x00
+	vpxor	xmm6,xmm6,xmm9
+	vpunpckhqdq	xmm2,xmm13,xmm13
+	vpclmulqdq	xmm14,xmm14,xmm3,0x11
+	vpxor	xmm2,xmm2,xmm13
+	vpslldq	xmm9,xmm6,8
+	vpclmulqdq	xmm1,xmm1,xmm15,0x00
+	vpxor	xmm8,xmm5,xmm9
+	vpsrldq	xmm6,xmm6,8
+	vpxor	xmm7,xmm7,xmm6
+
+	vpclmulqdq	xmm5,xmm13,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((48-32))+r9]
+	vpxor	xmm5,xmm5,xmm4
+	vpunpckhqdq	xmm9,xmm12,xmm12
+	vpclmulqdq	xmm13,xmm13,xmm0,0x11
+	vpxor	xmm9,xmm9,xmm12
+	vpxor	xmm13,xmm13,xmm14
+	vpalignr	xmm14,xmm8,xmm8,8
+	vpclmulqdq	xmm2,xmm2,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((80-32))+r9]
+	vpxor	xmm2,xmm2,xmm1
+
+	vpclmulqdq	xmm4,xmm12,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((64-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm1,xmm11,xmm11
+	vpclmulqdq	xmm12,xmm12,xmm3,0x11
+	vpxor	xmm1,xmm1,xmm11
+	vpxor	xmm12,xmm12,xmm13
+	vxorps	xmm7,xmm7,XMMWORD[16+rsp]
+	vpclmulqdq	xmm9,xmm9,xmm15,0x00
+	vpxor	xmm9,xmm9,xmm2
+
+	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
+	vxorps	xmm8,xmm8,xmm14
+
+	vpclmulqdq	xmm5,xmm11,xmm0,0x00
+	vmovdqu	xmm3,XMMWORD[((96-32))+r9]
+	vpxor	xmm5,xmm5,xmm4
+	vpunpckhqdq	xmm2,xmm10,xmm10
+	vpclmulqdq	xmm11,xmm11,xmm0,0x11
+	vpxor	xmm2,xmm2,xmm10
+	vpalignr	xmm14,xmm8,xmm8,8
+	vpxor	xmm11,xmm11,xmm12
+	vpclmulqdq	xmm1,xmm1,xmm15,0x10
+	vmovdqu	xmm15,XMMWORD[((128-32))+r9]
+	vpxor	xmm1,xmm1,xmm9
+
+	vxorps	xmm14,xmm14,xmm7
+	vpclmulqdq	xmm8,xmm8,XMMWORD[16+r11],0x10
+	vxorps	xmm8,xmm8,xmm14
+
+	vpclmulqdq	xmm4,xmm10,xmm3,0x00
+	vmovdqu	xmm0,XMMWORD[((112-32))+r9]
+	vpxor	xmm4,xmm4,xmm5
+	vpunpckhqdq	xmm9,xmm8,xmm8
+	vpclmulqdq	xmm10,xmm10,xmm3,0x11
+	vpxor	xmm9,xmm9,xmm8
+	vpxor	xmm10,xmm10,xmm11
+	vpclmulqdq	xmm2,xmm2,xmm15,0x00
+	vpxor	xmm2,xmm2,xmm1
+
+	vpclmulqdq	xmm5,xmm8,xmm0,0x00
+	vpclmulqdq	xmm7,xmm8,xmm0,0x11
+	vpxor	xmm5,xmm5,xmm4
+	vpclmulqdq	xmm6,xmm9,xmm15,0x10
+	vpxor	xmm7,xmm7,xmm10
+	vpxor	xmm6,xmm6,xmm2
+
+	vpxor	xmm4,xmm7,xmm5
+	vpxor	xmm6,xmm6,xmm4
+	vpslldq	xmm1,xmm6,8
+	vmovdqu	xmm3,XMMWORD[16+r11]
+	vpsrldq	xmm6,xmm6,8
+	vpxor	xmm8,xmm5,xmm1
+	vpxor	xmm7,xmm7,xmm6
+
+	vpalignr	xmm2,xmm8,xmm8,8
+	vpclmulqdq	xmm8,xmm8,xmm3,0x10
+	vpxor	xmm8,xmm8,xmm2
+
+	vpalignr	xmm2,xmm8,xmm8,8
+	vpclmulqdq	xmm8,xmm8,xmm3,0x10
+	vpxor	xmm2,xmm2,xmm7
+	vpxor	xmm8,xmm8,xmm2
+	vpshufb	xmm8,xmm8,XMMWORD[r11]
+	vmovdqu	XMMWORD[(-64)+r9],xmm8
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[((-216))+rax]
+	movaps	xmm7,XMMWORD[((-200))+rax]
+	movaps	xmm8,XMMWORD[((-184))+rax]
+	movaps	xmm9,XMMWORD[((-168))+rax]
+	movaps	xmm10,XMMWORD[((-152))+rax]
+	movaps	xmm11,XMMWORD[((-136))+rax]
+	movaps	xmm12,XMMWORD[((-120))+rax]
+	movaps	xmm13,XMMWORD[((-104))+rax]
+	movaps	xmm14,XMMWORD[((-88))+rax]
+	movaps	xmm15,XMMWORD[((-72))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	lea	rsp,[rax]
+$L$gcm_enc_abort:
+	mov	rax,r10
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_aesni_gcm_encrypt:
+ALIGN	64
+$L$bswap_mask:
+DB	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+$L$poly:
+DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+$L$one_msb:
+DB	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+$L$two_lsb:
+DB	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+$L$one_lsb:
+DB	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+DB	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108
+DB	101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82
+DB	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+DB	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ALIGN	64
+EXTERN	__imp_RtlVirtualUnwind
+
+ALIGN	16
+gcm_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[120+r8]
+
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	mov	QWORD[240+r8],r15
+	mov	QWORD[232+r8],r14
+	mov	QWORD[224+r8],r13
+	mov	QWORD[216+r8],r12
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[144+r8],rbx
+
+	lea	rsi,[((-216))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
 	DB	0F3h,0C3h		;repret
 
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase
+	DD	$L$SEH_end_aesni_gcm_decrypt wrt ..imagebase
+	DD	$L$SEH_gcm_dec_info wrt ..imagebase
+
+	DD	$L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase
+	DD	$L$SEH_end_aesni_gcm_encrypt wrt ..imagebase
+	DD	$L$SEH_gcm_enc_info wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_gcm_dec_info:
+DB	9,0,0,0
+	DD	gcm_se_handler wrt ..imagebase
+	DD	$L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase
+$L$SEH_gcm_enc_info:
+DB	9,0,0,0
+	DD	gcm_se_handler wrt ..imagebase
+	DD	$L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase
diff --git a/win-x86_64/crypto/modes/ghash-x86_64.asm b/win-x86_64/crypto/modes/ghash-x86_64.asm
index 5d8fadc..e5204bf 100644
--- a/win-x86_64/crypto/modes/ghash-x86_64.asm
+++ b/win-x86_64/crypto/modes/ghash-x86_64.asm
@@ -1309,7 +1309,115 @@
 
 ALIGN	32
 gcm_init_avx:
-	jmp	NEAR $L$_init_clmul
+$L$SEH_begin_gcm_init_avx:
+
+DB	0x48,0x83,0xec,0x18
+DB	0x0f,0x29,0x34,0x24
+	vzeroupper
+
+	vmovdqu	xmm2,XMMWORD[rdx]
+	vpshufd	xmm2,xmm2,78
+
+
+	vpshufd	xmm4,xmm2,255
+	vpsrlq	xmm3,xmm2,63
+	vpsllq	xmm2,xmm2,1
+	vpxor	xmm5,xmm5,xmm5
+	vpcmpgtd	xmm5,xmm5,xmm4
+	vpslldq	xmm3,xmm3,8
+	vpor	xmm2,xmm2,xmm3
+
+
+	vpand	xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
+	vpxor	xmm2,xmm2,xmm5
+
+	vpunpckhqdq	xmm6,xmm2,xmm2
+	vmovdqa	xmm0,xmm2
+	vpxor	xmm6,xmm6,xmm2
+	mov	r10,4
+	jmp	NEAR $L$init_start_avx
+ALIGN	32
+$L$init_loop_avx:
+	vpalignr	xmm5,xmm4,xmm3,8
+	vmovdqu	XMMWORD[(-16)+rcx],xmm5
+	vpunpckhqdq	xmm3,xmm0,xmm0
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm1,xmm0,xmm2,0x11
+	vpclmulqdq	xmm0,xmm0,xmm2,0x00
+	vpclmulqdq	xmm3,xmm3,xmm6,0x00
+	vpxor	xmm4,xmm1,xmm0
+	vpxor	xmm3,xmm3,xmm4
+
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm0,xmm0,xmm4
+	vpxor	xmm1,xmm1,xmm3
+	vpsllq	xmm3,xmm0,57
+	vpsllq	xmm4,xmm0,62
+	vpxor	xmm4,xmm4,xmm3
+	vpsllq	xmm3,xmm0,63
+	vpxor	xmm4,xmm4,xmm3
+	vpslldq	xmm3,xmm4,8
+	vpsrldq	xmm4,xmm4,8
+	vpxor	xmm0,xmm0,xmm3
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrlq	xmm4,xmm0,1
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm4,xmm4,5
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm0,xmm0,1
+	vpxor	xmm0,xmm0,xmm1
+$L$init_start_avx:
+	vmovdqa	xmm5,xmm0
+	vpunpckhqdq	xmm3,xmm0,xmm0
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm1,xmm0,xmm2,0x11
+	vpclmulqdq	xmm0,xmm0,xmm2,0x00
+	vpclmulqdq	xmm3,xmm3,xmm6,0x00
+	vpxor	xmm4,xmm1,xmm0
+	vpxor	xmm3,xmm3,xmm4
+
+	vpslldq	xmm4,xmm3,8
+	vpsrldq	xmm3,xmm3,8
+	vpxor	xmm0,xmm0,xmm4
+	vpxor	xmm1,xmm1,xmm3
+	vpsllq	xmm3,xmm0,57
+	vpsllq	xmm4,xmm0,62
+	vpxor	xmm4,xmm4,xmm3
+	vpsllq	xmm3,xmm0,63
+	vpxor	xmm4,xmm4,xmm3
+	vpslldq	xmm3,xmm4,8
+	vpsrldq	xmm4,xmm4,8
+	vpxor	xmm0,xmm0,xmm3
+	vpxor	xmm1,xmm1,xmm4
+
+	vpsrlq	xmm4,xmm0,1
+	vpxor	xmm1,xmm1,xmm0
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm4,xmm4,5
+	vpxor	xmm0,xmm0,xmm4
+	vpsrlq	xmm0,xmm0,1
+	vpxor	xmm0,xmm0,xmm1
+	vpshufd	xmm3,xmm5,78
+	vpshufd	xmm4,xmm0,78
+	vpxor	xmm3,xmm3,xmm5
+	vmovdqu	XMMWORD[rcx],xmm5
+	vpxor	xmm4,xmm4,xmm0
+	vmovdqu	XMMWORD[16+rcx],xmm0
+	lea	rcx,[48+rcx]
+	sub	r10,1
+	jnz	NEAR $L$init_loop_avx
+
+	vpalignr	xmm5,xmm3,xmm4,8
+	vmovdqu	XMMWORD[(-16)+rcx],xmm5
+
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	lea	rsp,[24+rsp]
+$L$SEH_end_gcm_init_avx:
+	DB	0F3h,0C3h		;repret
 
 global	gcm_gmult_avx
 
@@ -1321,7 +1429,403 @@
 
 ALIGN	32
 gcm_ghash_avx:
-	jmp	NEAR $L$_ghash_clmul
+	lea	rax,[((-136))+rsp]
+$L$SEH_begin_gcm_ghash_avx:
+
+DB	0x48,0x8d,0x60,0xe0
+DB	0x0f,0x29,0x70,0xe0
+DB	0x0f,0x29,0x78,0xf0
+DB	0x44,0x0f,0x29,0x00
+DB	0x44,0x0f,0x29,0x48,0x10
+DB	0x44,0x0f,0x29,0x50,0x20
+DB	0x44,0x0f,0x29,0x58,0x30
+DB	0x44,0x0f,0x29,0x60,0x40
+DB	0x44,0x0f,0x29,0x68,0x50
+DB	0x44,0x0f,0x29,0x70,0x60
+DB	0x44,0x0f,0x29,0x78,0x70
+	vzeroupper
+
+	vmovdqu	xmm10,XMMWORD[rcx]
+	lea	r10,[$L$0x1c2_polynomial]
+	lea	rdx,[64+rdx]
+	vmovdqu	xmm13,XMMWORD[$L$bswap_mask]
+	vpshufb	xmm10,xmm10,xmm13
+	cmp	r9,0x80
+	jb	NEAR $L$short_avx
+	sub	r9,0x80
+
+	vmovdqu	xmm14,XMMWORD[112+r8]
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vpshufb	xmm14,xmm14,xmm13
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vmovdqu	xmm15,XMMWORD[96+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm9,xmm9,xmm14
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vmovdqu	xmm14,XMMWORD[80+r8]
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vmovdqu	xmm15,XMMWORD[64+r8]
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[48+r8]
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpxor	xmm2,xmm2,xmm5
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[32+r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[16+r8]
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm1,xmm1,xmm4
+	vpshufb	xmm14,xmm14,xmm13
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpxor	xmm2,xmm2,xmm5
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm4,xmm4,xmm1
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm9,xmm7,0x10
+
+	lea	r8,[128+r8]
+	cmp	r9,0x80
+	jb	NEAR $L$tail_avx
+
+	vpxor	xmm15,xmm15,xmm10
+	sub	r9,0x80
+	jmp	NEAR $L$oop8x_avx
+
+ALIGN	32
+$L$oop8x_avx:
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vmovdqu	xmm14,XMMWORD[112+r8]
+	vpxor	xmm3,xmm3,xmm0
+	vpxor	xmm8,xmm8,xmm15
+	vpclmulqdq	xmm10,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm11,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm12,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+
+	vmovdqu	xmm15,XMMWORD[96+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpxor	xmm10,xmm10,xmm3
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vxorps	xmm11,xmm11,xmm4
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm12,xmm12,xmm5
+	vxorps	xmm8,xmm8,xmm15
+
+	vmovdqu	xmm14,XMMWORD[80+r8]
+	vpxor	xmm12,xmm12,xmm10
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpxor	xmm12,xmm12,xmm11
+	vpslldq	xmm9,xmm12,8
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vpsrldq	xmm12,xmm12,8
+	vpxor	xmm10,xmm10,xmm9
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpshufb	xmm14,xmm14,xmm13
+	vxorps	xmm11,xmm11,xmm12
+	vpxor	xmm4,xmm4,xmm1
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[64+r8]
+	vpalignr	xmm12,xmm10,xmm10,8
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vxorps	xmm8,xmm8,xmm15
+	vpxor	xmm2,xmm2,xmm5
+
+	vmovdqu	xmm14,XMMWORD[48+r8]
+	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[32+r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpxor	xmm0,xmm0,xmm3
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm1,xmm1,xmm4
+	vpclmulqdq	xmm2,xmm9,xmm7,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vpxor	xmm2,xmm2,xmm5
+	vxorps	xmm10,xmm10,xmm12
+
+	vmovdqu	xmm14,XMMWORD[16+r8]
+	vpalignr	xmm12,xmm10,xmm10,8
+	vpclmulqdq	xmm3,xmm15,xmm6,0x00
+	vpshufb	xmm14,xmm14,xmm13
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm4,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpclmulqdq	xmm10,xmm10,XMMWORD[r10],0x10
+	vxorps	xmm12,xmm12,xmm11
+	vpunpckhqdq	xmm9,xmm14,xmm14
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm5,xmm8,xmm7,0x10
+	vmovdqu	xmm7,XMMWORD[((176-64))+rdx]
+	vpxor	xmm9,xmm9,xmm14
+	vpxor	xmm5,xmm5,xmm2
+
+	vmovdqu	xmm15,XMMWORD[r8]
+	vpclmulqdq	xmm0,xmm14,xmm6,0x00
+	vpshufb	xmm15,xmm15,xmm13
+	vpclmulqdq	xmm1,xmm14,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((160-64))+rdx]
+	vpxor	xmm15,xmm15,xmm12
+	vpclmulqdq	xmm2,xmm9,xmm7,0x10
+	vpxor	xmm15,xmm15,xmm10
+
+	lea	r8,[128+r8]
+	sub	r9,0x80
+	jnc	NEAR $L$oop8x_avx
+
+	add	r9,0x80
+	jmp	NEAR $L$tail_no_xor_avx
+
+ALIGN	32
+$L$short_avx:
+	vmovdqu	xmm14,XMMWORD[((-16))+r9*1+r8]
+	lea	r8,[r9*1+r8]
+	vmovdqu	xmm6,XMMWORD[((0-64))+rdx]
+	vmovdqu	xmm7,XMMWORD[((32-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+
+	vmovdqa	xmm3,xmm0
+	vmovdqa	xmm4,xmm1
+	vmovdqa	xmm5,xmm2
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-32))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((16-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-48))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((48-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((80-64))+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-64))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((64-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-80))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((96-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovdqu	xmm7,XMMWORD[((128-64))+rdx]
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-96))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((112-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vpsrldq	xmm7,xmm7,8
+	sub	r9,0x10
+	jz	NEAR $L$tail_avx
+
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vmovdqu	xmm14,XMMWORD[((-112))+r8]
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vmovdqu	xmm6,XMMWORD[((144-64))+rdx]
+	vpshufb	xmm15,xmm14,xmm13
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+	vmovq	xmm7,QWORD[((184-64))+rdx]
+	sub	r9,0x10
+	jmp	NEAR $L$tail_avx
+
+ALIGN	32
+$L$tail_avx:
+	vpxor	xmm15,xmm15,xmm10
+$L$tail_no_xor_avx:
+	vpunpckhqdq	xmm8,xmm15,xmm15
+	vpxor	xmm3,xmm3,xmm0
+	vpclmulqdq	xmm0,xmm15,xmm6,0x00
+	vpxor	xmm8,xmm8,xmm15
+	vpxor	xmm4,xmm4,xmm1
+	vpclmulqdq	xmm1,xmm15,xmm6,0x11
+	vpxor	xmm5,xmm5,xmm2
+	vpclmulqdq	xmm2,xmm8,xmm7,0x00
+
+	vmovdqu	xmm12,XMMWORD[r10]
+
+	vpxor	xmm10,xmm3,xmm0
+	vpxor	xmm11,xmm4,xmm1
+	vpxor	xmm5,xmm5,xmm2
+
+	vpxor	xmm5,xmm5,xmm10
+	vpxor	xmm5,xmm5,xmm11
+	vpslldq	xmm9,xmm5,8
+	vpsrldq	xmm5,xmm5,8
+	vpxor	xmm10,xmm10,xmm9
+	vpxor	xmm11,xmm11,xmm5
+
+	vpclmulqdq	xmm9,xmm10,xmm12,0x10
+	vpalignr	xmm10,xmm10,xmm10,8
+	vpxor	xmm10,xmm10,xmm9
+
+	vpclmulqdq	xmm9,xmm10,xmm12,0x10
+	vpalignr	xmm10,xmm10,xmm10,8
+	vpxor	xmm10,xmm10,xmm11
+	vpxor	xmm10,xmm10,xmm9
+
+	cmp	r9,0
+	jne	NEAR $L$short_avx
+
+	vpshufb	xmm10,xmm10,xmm13
+	vmovdqu	XMMWORD[rcx],xmm10
+	vzeroupper
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	xmm15,XMMWORD[144+rsp]
+	lea	rsp,[168+rsp]
+$L$SEH_end_gcm_ghash_avx:
+	DB	0F3h,0C3h		;repret
 
 ALIGN	64
 $L$bswap_mask:
@@ -1478,6 +1982,13 @@
 	DD	$L$SEH_begin_gcm_ghash_clmul wrt ..imagebase
 	DD	$L$SEH_end_gcm_ghash_clmul wrt ..imagebase
 	DD	$L$SEH_info_gcm_ghash_clmul wrt ..imagebase
+	DD	$L$SEH_begin_gcm_init_avx wrt ..imagebase
+	DD	$L$SEH_end_gcm_init_avx wrt ..imagebase
+	DD	$L$SEH_info_gcm_init_clmul wrt ..imagebase
+
+	DD	$L$SEH_begin_gcm_ghash_avx wrt ..imagebase
+	DD	$L$SEH_end_gcm_ghash_avx wrt ..imagebase
+	DD	$L$SEH_info_gcm_ghash_clmul wrt ..imagebase
 section	.xdata rdata align=8
 ALIGN	8
 $L$SEH_info_gcm_gmult_4bit: